Data Pre-processing

In this section we create a full matrix of items and users by utilizing collaborative filtering technique.

# Install required R packages
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
library(reshape2)
library(ggplot2)
library(reshape2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
##Read the given Online retail Dataset from the saved location
UCIUnique<-read.csv("/Users/uzma/Downloads/ml-latest-small/Online Retail.csv",header=TRUE)

#Summary of the Transactional Data
summary(UCIUnique)
##    InvoiceNo        StockCode     
##  573585 :  1114   85099B :  1157  
##  581219 :   749   85123A :  1033  
##  581492 :   731   22086  :   995  
##  580729 :   721   23203  :   980  
##  558475 :   705   22423  :   969  
##  579777 :   687   23084  :   886  
##  (Other):299420   (Other):298107  
##                              Description        Quantity        
##  JUMBO BAG RED RETROSPOT           :  1159   Min.   :-80995.00  
##  WHITE HANGING HEART T-LIGHT HOLDER:  1065   1st Qu.:     1.00  
##  PAPER CHAIN KIT 50'S CHRISTMAS    :   995   Median :     4.00  
##  REGENCY CAKESTAND 3 TIER          :   971   Mean   :     9.83  
##  RABBIT NIGHT LIGHT                :   871   3rd Qu.:    12.00  
##  LUNCH BAG RED RETROSPOT           :   846   Max.   : 80995.00  
##  (Other)                           :298220                      
##          InvoiceDate       UnitPrice            CustomerID   
##  10/31/11 14:41:  1114   Min.   :-11062.060   Min.   :12347  
##  12/8/11 9:28  :   749   1st Qu.:     1.250   1st Qu.:13975  
##  12/9/11 10:03 :   731   Median :     2.080   Median :15156  
##  12/5/11 17:24 :   721   Mean   :     4.252   Mean   :15288  
##  6/29/11 15:58 :   705   3rd Qu.:     4.130   3rd Qu.:16775  
##  11/30/11 15:13:   687   Max.   : 17836.460   Max.   :18287  
##  (Other)       :299420                        NA's   :33714  
##            Country      
##  United Kingdom:276922  
##  EIRE          :  5338  
##  Germany       :  5327  
##  France        :  4979  
##  Spain         :  1386  
##  Switzerland   :  1304  
##  (Other)       :  8871
head(UCIUnique)
##   InvoiceNo StockCode                         Description Quantity
## 1    563756     22440       BALLOON WATER BOMB PACK OF 35       20
## 2    563756     21878 PACK OF 6 SANDCASTLE FLAGS ASSORTED       12
## 3    563756     22607         WOODEN ROUNDERS GARDEN SET         2
## 4    563756     22549                    PICTURE DOMINOES       12
## 5    563756     21791  VINTAGE HEADS AND TAILS CARD GAME        12
## 6    563756     21888                           BINGO SET        4
##     InvoiceDate UnitPrice CustomerID     Country
## 1 8/19/11 11:08      0.42      12418 Switzerland
## 2 8/19/11 11:08      0.85      12418 Switzerland
## 3 8/19/11 11:08      9.95      12418 Switzerland
## 4 8/19/11 11:08      1.45      12418 Switzerland
## 5 8/19/11 11:08      1.25      12418 Switzerland
## 6 8/19/11 11:08      3.75      12418 Switzerland
##Create a dataframe that contains calculated utility values along with Item and customer Ids
tmp1<-cbind.data.frame(ItemID=UCIUnique$StockCode,CustomerID=UCIUnique$CustomerID,Utility=UCIUnique$Quantity*UCIUnique$UnitPrice)

##Aggregate Utility values for the same item and customer (repeated observations) because utility is a scalar
tmp11<-aggregate(Utility ~ ItemID + CustomerID,data=tmp1,FUN=sum)
tmp2<-unique(tmp11)

##Create another data frame for profit margin
Profmar<-cbind.data.frame(ItemID=UCIUnique$StockCode,ProfitMargin=UCIUnique$UnitPrice*0.25,Quantity=UCIUnique$Quantity)
Profmar1<-unique(Profmar)

##Create a recommender algorithm for a subset of 26 users and 970 items: created because we picked up first 1500 observations from the dataset
x<-tmp2[1:1500,]
finaltemporary<-acast(x,x$CustomerID ~ x$ItemID,value.var="Utility")

## use collaborative filtering to get full matrix
## Confirm the class
class(finaltemporary)
## [1] "matrix"
# Convert it to matrix
Fnewdata<-as.matrix(finaltemporary)
Fnewdata[is.na(Fnewdata)] = 0

## Rescale each column to range between 0 and 5
Fnewdata<-apply(Fnewdata, MARGIN = 2, FUN = function(X) ((X - min(X))*5)/diff(range(X)))
Fnewdata[Fnewdata==0]<-NA

# Convert this into realRatingMatrix data structure
# This will be a recommenderlab sparse-matrix like data-structure
recomdata1 <- as(Fnewdata, "realRatingMatrix")

##User Based Collaborative Filtering (UBCF)
rec11=Recommender(recomdata1[1:nrow(recomdata1)],method="POPULAR")

##Predict the unknown ratings
recom11 <- predict(rec11, recomdata1[1:nrow(recomdata1)], type="ratings")

# Convert prediction into a complete matrix
#as(recom1, "list")
recomfinal1<-as(recom11, "matrix") # Is full of ratings. NAs disappear
recomfinal1[is.na(recomfinal1)] = 0

##Scale back to 0-5 range
recomfinal1<-apply(recomfinal1, MARGIN = 2, FUN = function(X) ((X - min(X))*5)/diff(range(X)))

##Superimpose existing and predicted to get the full matrix
Fnewdata[is.na(Fnewdata)] = 0
recomfinal1[is.na(recomfinal1)] = 0
for(i in 1:nrow(recomfinal1)){
  for (j in 1:ncol(recomfinal1)){
    if (recomfinal1[i,j]==0){
      recomfinal1[i,j]<-Fnewdata[i,j]
    }
  }
}

##recomfinal1 is the matrix we will use for greedy algorithm
##Size of the assortment to be tested is denoted by NF
NF<-8
##Create a matrix with total number of rows= customer number in recomfinal1 and columns=alpha value+z value+items #picked+user number+system time
library(reshape2)
library(data.table)
dframeoutput<- data.frame(matrix(ncol = 5+NF, nrow = 0))
#x31 <- c("Replication number","Customer ID", "alpha", "z",as.list("Items") ,"System Time")
x31 <- c("Replication number","Customer ID", "alpha", "z" ,"System Time")
colnames(dframeoutput) <- x31
colnames(dframeoutput[,6:(5+NF)])<-"Items"
BFdataframe<- data.frame(matrix(ncol = 5+NF, nrow = 0))
#x31 <- c("Replication number","Customer ID", "alpha", "z",as.list("Items") ,"System Time")
y31 <- c("Replication number","Customer ID", "alpha", "z" ,"System Time")
colnames(BFdataframe) <- y31
colnames(BFdataframe[,6:(5+NF)])<-"Items"

##Run a loop to pick N items for each user and set five replications
#set.seed(1)
##Alpha values
t1<-c(0.01,0.1,1)
#z values
t2<-c(0,1,4)
#Start replications
##Initialize
s1<-1
u<-1
# r is for replications, l is for users
for(r in 1:2){
   for (l in 1:nrow(recomfinal1)){
     for(z1 in t2){
        for(alpha in t1){
        set.seed(r)
        testset<-sample(recomfinal1[l,], NF, replace = FALSE, prob = NULL)
##Reshape for this user
        long <- melt(testset)
#Rownames are item ids -- create a column for those
        setDT(long, keep.rownames = TRUE)[]
##From Profmargin data frame merge the revenue
        colnames(long)[1]<-"ItemID"
        colnames(long)[2]<-"Utility"
##For loop to identify profit margins for each item
        longtemp <- data.frame(matrix(ncol = 3, nrow = 0))
        x <- c("ItemID", "Utility", "Margin")
        colnames(longtemp) <- x
## Final data set for greedy and bruteforce
        longtemp<-merge(long, Profmar1, by="ItemID")
## Calulate mean of all profit margins for the same item because of different countries they are sold in
        longtemp1<-aggregate(.~ItemID, data=longtemp, mean)
##sytem time
old <- Sys.time() # get start time
Items = c(1:NF) 
Revenue=longtemp1$ProfitMargin
Preference.weight=exp(longtemp1$Utility)
ItemID=longtemp1$ItemID
dataf = data.frame(Items, Revenue, Preference.weight)#,ItemID) 
### Complete Code and Output for Greedy Algorithm
#Declare variables
n<-max(dataf$Items, na.rm = TRUE)
Q1<-matrix(0,n,n)
V0<-vector(,n)
item_index<-vector(,n)
f1<-vector(,n+1)
#Compute No-Choice
for (j in 1:n){
V0[j]<-exp((j-z1)*alpha)}
#Start Algorithm A1
k=1
f1[1]<-0
while(k<=n){
  
            for(i in 1:n){
              if (k==1){Q1[i,k]<-(dataf$Revenue[i]*dataf$Preference.weight[i])/(dataf$Preference.weight[i]+V0[k])
              }
              else
                  { 
                    Q1[i,k]<-((dataf$Revenue[i]*dataf$Preference.weight[i])/(P1f+dataf$Preference.weight[i]+V0[k])
                        +(P2f/(P1f+dataf$Preference.weight[i]+V0[k]))) }
            }
           for(z in item_index){Q1[z,k]<-0}
           f1[k+1]<-max(Q1[,k],na.rm=FALSE)
           
#Identify highest expected revenue and corresponding item
           if(f1[k+1]>f1[k]){
             item_index[k]<-which.max(Q1[,k])
             P1=NULL
             P2=NULL
             P1f=NULL
             P2f=NULL
             for(j in item_index){ P1[j]<-(dataf$Preference.weight[j])
             P2[j]<-(dataf$Preference.weight[j]*dataf$Revenue[j])
             }
             P1f<-sum(P1,na.rm=TRUE)
             P2f<-sum(P2,na.rm=TRUE)
              k=k+1  }
           else
             {#cat("Assortment returned by Algorithm 1 = ", item_index, "\n")
             break}
}
# print elapsed time
new <- Sys.time() - old # calculate difference
y<-rownames(recomfinal1)[l]
#y1<-(item_index)
y1<-rep(0,NF)
for(p1 in 1:NF){
  if(item_index[p1]>0){y1[p1]<-as.character(longtemp1[item_index[p1],1])}
  else{y1[p1]<-0}
}
dframeoutput[s1,5]<-new
dframeoutput[s1,1]<-r
dframeoutput[s1,2]<-y
dframeoutput[s1,3]<-alpha
dframeoutput[s1,4]<-z1
dframeoutput[s1,6:(5+NF)]<-y1
s1<-s1+1

##Brute force algorithm for the same replication for every user
# Brute force--all 2^n assortments
#Use the function to generate all assortments and calculate their expected revenues
# Create a vector to store all revenues
old1 <- Sys.time() # get start time
d=NULL
PW=NULL
s = n;
res <- sapply(0:(s-1),function(x)rep(c(rep(0,2^x),rep(1,2^x)),2^(s-x-1)))
t<-2^n
E=matrix(0,t,n)
for(i in 1:t){
  d<-rowSums(res)
  for(j in 1:n){
    if(res[i,j]>0){
      PW[j]<-dataf$Preference.weight[j]
    }
    else
    {PW[j]<-0}
    }
    PWSum<-sum(PW,na.rm=TRUE)
    for (k in 1:n){
      if(res[i,k]>0){
    E[i,k]<-(dataf$Revenue[k]*dataf$Preference.weight[k])/(PWSum+V0[d[i]])}
      else{E[i,k]<-0}
        }
 }
Final<-rowSums(E)
#max(Final)
y3<-rep(0,NF)
y5<-rep(0,NF)
y4<-as.integer(E[which.max(Final),]>0)
for(p in 1:NF){
  if (y4[p]=="1"){y3[p]<-p}
  else{y3[p]<-0}
}
for(p2 in 1:NF){
  if(y3[p2]>0){y5[p2]<-as.character(longtemp1[y3[p2],1])}
  else{y5[p2]<-0}
}
#BFSolution<-which( E[which.max(Final),]>0, arr.ind=TRUE)
#cat("Assortment returned by brute force = ", BFSolution, "\n")
new1 <- Sys.time() - old1 # calculate difference
q<-rownames(recomfinal1)[l]
#y2<-(BFSolution)
BFdataframe[u,5]<-new1
BFdataframe[u,1]<-r
BFdataframe[u,2]<-q
BFdataframe[u,3]<-alpha
BFdataframe[u,4]<-z1
BFdataframe[u,6:(5+NF)]<-y5
u<-u+1
}
  }
     }
}


## What items were selected
##Brute force
head(BFdataframe)
##   Replication number Customer ID alpha z System Time NA NA NA NA NA NA NA
## 1                  1       12347  0.01 0  0.05548811  0  0  0  0  0  0  0
## 2                  1       12347  0.10 0  0.06352782  0  0  0  0  0  0  0
## 3                  1       12347  1.00 0  0.04884505  0  0  0  0  0  0  0
## 4                  1       12347  0.01 1  0.06206989  0  0  0  0  0  0  0
## 5                  1       12347  0.10 1  0.04942608  0  0  0  0  0  0  0
## 6                  1       12347  1.00 1  0.04765201  0  0  0  0  0  0  0
##       NA
## 1 84078A
## 2 84078A
## 3 84078A
## 4 84078A
## 5 84078A
## 6 84078A
tail(BFdataframe)
##     Replication number Customer ID alpha z System Time NA NA NA NA NA NA
## 481                  2       12384  0.01 1  0.04899287  0  0  0  0  0  0
## 482                  2       12384  0.10 1  0.05241203  0  0  0  0  0  0
## 483                  2       12384  1.00 1  0.04559612  0  0  0  0  0  0
## 484                  2       12384  0.01 4  0.04828596  0  0  0  0  0  0
## 485                  2       12384  0.10 4  0.05352497  0  0  0  0  0  0
## 486                  2       12384  1.00 4  0.05048203  0  0  0  0  0  0
##         NA NA
## 481 84078A  0
## 482 84078A  0
## 483 84078A  0
## 484 84078A  0
## 485 84078A  0
## 486 84078A  0
##greedy
head(dframeoutput)
##   Replication number Customer ID alpha z System Time     NA NA NA NA NA NA
## 1                  1       12347  0.01 0 0.001432896 84078A  0  0  0  0  0
## 2                  1       12347  0.10 0 0.001550913 84078A  0  0  0  0  0
## 3                  1       12347  1.00 0 0.001390934 84078A  0  0  0  0  0
## 4                  1       12347  0.01 1 0.001423836 84078A  0  0  0  0  0
## 5                  1       12347  0.10 1 0.001372814 84078A  0  0  0  0  0
## 6                  1       12347  1.00 1 0.001473904 84078A  0  0  0  0  0
##   NA NA
## 1  0  0
## 2  0  0
## 3  0  0
## 4  0  0
## 5  0  0
## 6  0  0
tail(dframeoutput)
##     Replication number Customer ID alpha z System Time     NA NA NA NA NA
## 481                  2       12384  0.01 1 0.001315117 84078A  0  0  0  0
## 482                  2       12384  0.10 1 0.001347065 84078A  0  0  0  0
## 483                  2       12384  1.00 1 0.001333952 84078A  0  0  0  0
## 484                  2       12384  0.01 4 0.001329184 84078A  0  0  0  0
## 485                  2       12384  0.10 4 0.001461983 84078A  0  0  0  0
## 486                  2       12384  1.00 4 0.001428127 84078A  0  0  0  0
##     NA NA NA
## 481  0  0  0
## 482  0  0  0
## 483  0  0  0
## 484  0  0  0
## 485  0  0  0
## 486  0  0  0
#whattoselect<-which(rownames(dataf$ItemID))