# Amazon Book Sales Cluster

# Packages ----
# The script calls igraph (graph_from_data_frame, V, degree, plotting),
# ggplot2 (histograms and themes) and dplyr (%>% and inner_join()), so they
# are attached here before anything else runs.
library(igraph)
library(ggplot2)
library(dplyr)

# Load data ----
# NOTE(review): hard-coded, machine-specific working directory. The relative
# file names passed to read.table() / read.csv() in this script resolve
# against it, so it is kept as-is.
setwd("~/Desktop/Network Analytics")

# Edge list of the rank-1000 subset; its columns are named "from" and "to".
network_data_subset <- read.table(file = "graph_subset_rank1000.txt")
names(network_data_subset) <- c("from", "to")

# Convert data ----
# The edge-list data frame is replaced by an undirected igraph object.
network_data_subset <-
  graph_from_data_frame(network_data_subset, directed = FALSE, vertices = NULL)

# Plotting the network ----
# Kamada-Kawai layout with small, unlabelled blue vertices and slightly
# curved edges. layout_with_kk is the current name of the deprecated
# layout.kamada.kawai alias; plot() dispatches to the igraph plot method.
# NOTE(review): the negative `asp` and `margin` values are carried over
# unchanged - confirm they are intentional.
plot(
  network_data_subset,
  layout = layout_with_kk,
  vertex.color = "blue",
  vertex.label = NA,
  vertex.size = 4,
  edge.curved = .1,
  asp = -1,
  margin = -.03
)
title(
  "Network Data Plot",
  cex.main = .75,
  col.main = "Black"
)

# Comments - there seems to be a large gap in the bottom-left quadrant of the network data plot. The network data plot also seems to be clustered towards the edges.

# Using subset to plot top 1000 ----

# Load data and convert
# Edge list read from the "_CC" subset file; its columns are named
# "from" and "to".
network_data_subset_cc <- read.table(file = "graph_subset_rank1000_CC.txt")
names(network_data_subset_cc) <- c("from", "to")

# The edge-list data frame is replaced by an undirected igraph object.
network_data_subset_cc <-
  graph_from_data_frame(
    network_data_subset_cc,
    directed = FALSE,
    vertices = NULL
  )

# Kamada-Kawai layout (layout_with_kk is the current name of the deprecated
# layout.kamada.kawai alias); plot() dispatches to the igraph plot method.
# The vertex sequence itself is passed as the colour specification.
# NOTE(review): presumably the vertex ids are meant to act as palette
# indices - confirm. `vertex.label.cex` is kept although labels are
# suppressed with `vertex.label = NA`.
plot(
  network_data_subset_cc,
  layout = layout_with_kk,
  edge.width = 2,
  vertex.color = V(network_data_subset_cc),
  vertex.label = NA,
  vertex.size = 3,
  vertex.label.cex = .5,
  edge.curved = .1,
  asp = -1,
  margin = -.03
)
# NOTE(review): the title wording ("component Components") looks like a
# leftover edit; left unchanged here - confirm the intended title.
title(
  "Largest connected component Components of Top-Ranked Products",
  cex.main = .75,
  col.main = "Black"
)

# 4 Data analysis ----
# Edge list of the complete graph; its columns are named "from" and "to".
net_all <- read.table(file = "graph_complete.txt")
names(net_all) <- c("from", "to")

# One-column data frames holding the "from" and "to" ids of every edge.
# They must be taken before `net_all` is replaced by the graph object below.
data_all_from <- as.data.frame(net_all$from)
data_all_to <- as.data.frame(net_all$to)

# Calculate out- and in-degrees ----

# Convert the edge list to a directed graph object.
net_all <- graph_from_data_frame(net_all, directed = TRUE, vertices = NULL)

# Out-degree of every vertex, stored on the graph object under `degree.out`.
net_all$degree.out <- degree(net_all, mode = "out")

# In-degree of every vertex, stored on the graph object under `degree.in`.
# The documented lower-case mode value is used, consistent with "out" above.
net_all$degree.in <- degree(net_all, mode = "in")

# Plot the out-degree distribution ----

# Data frame with one row per vertex: the row names of the converted degree
# vector become the "from" column, the degree itself the "out degree" column.
net_outdeg <- as.data.frame(net_all$degree.out)
net_outdeg <- cbind.data.frame(rownames(net_outdeg), net_outdeg)
names(net_outdeg) <- c("from", "out degree")

# Shared plot theme (reused by the in-degree histograms): black-and-white
# base, no panel border or grid lines, grey axis lines, size-10 axis text.
mytheme <- theme_bw() +
  theme(
    text = element_text(family = "Times New Roman"),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "grey"),
    axis.text.x = element_text(size = 10, angle = 0),
    axis.text.y = element_text(size = 10, angle = 0),
    axis.title.y = element_text(size = 10),
    axis.title.x = element_text(size = 10)
  )

# Plot distribution: one bin per degree value, bars shaded by their count.
# The column is referenced by name inside aes() (not via `net_outdeg$...`),
# and after_stat(count) replaces the deprecated `..count..` notation.
ggplot(net_outdeg, aes(x = `out degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = after_stat(count)),
    show.legend = FALSE
  ) +
  mytheme +
  labs(
    title = "Out-Degree Distribution",
    x = "Number of Similar Products",
    y = "Number of Nodes"
  ) +
  scale_fill_gradient(low = "grey", high = "black")

# Convert in-degree to a data frame
net_indeg <- as.data.frame(net_all$degree.in)

# Add the row names as the first column (i.e. "from")
net_indeg <- cbind.data.frame(rownames(net_indeg), net_indeg)
names(net_indeg) <- c("from", "in degree")

# In-degree histogram restricted to the range -1..30: one bin per degree
# value, bars shaded by their count. xlim() drops observations outside the
# range before binning. The column is referenced by name inside aes(), and
# after_stat(count) replaces the deprecated `..count..` notation.
ggplot(net_indeg, aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = after_stat(count)),
    show.legend = FALSE
  ) +
  mytheme +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "In-Degree Distribution (similar product limit = 30)",
    x = "Number of Similar Products",
    y = "Number of Nodes"
  ) +
  scale_fill_gradient(low = "grey", high = "black") +
  xlim(-1, 30)

# Observation - the new distribution is right skewed, compared to the previous distribution, which was left skewed.

# In-degree histogram over the full range: one bin per degree value, bars
# shaded by their count, y-axis labels formatted with thousands separators.
# The column is referenced by name inside aes() (not via `net_indeg$...`),
# and after_stat(count) replaces the deprecated `..count..` notation.
ggplot(net_indeg, aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = after_stat(count)),
    show.legend = FALSE
  ) +
  mytheme +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "In-Degree Distribution",
    x = "Number of Similar Products",
    y = "Number of Nodes"
  ) +
  scale_fill_gradient(low = "grey", high = "black")

#Transform the x-axis of the previous graph to log scale, to get a better understanding of the distribution. Note here that you should have some products with 0 inbound links. This means that using the log of the x-axis will fail since log(0) will not be valid. Due to this, you should replace 0 with 0.1. Comment on what you observe.

# Replace 0 with 0.1 so the log-scaled axis is defined for every vertex.
# Only the in-degree column is touched: comparing the whole data frame with 0
# would also match a vertex whose id in `from` is "0" and overwrite that id.
net_indeg_log <- as.data.frame(net_indeg)
zero_in_degree <- net_indeg_log[["in degree"]] == 0
net_indeg_log[["in degree"]][zero_in_degree] <- 0.1

# Plot the in-degree histogram on a log10 x-axis.
# NOTE(review): the scale transformation is applied before binning, so
# binwidth = 1 is one unit of log10(in degree) - confirm that is intended.
# The column is referenced by name inside aes(), and after_stat(count)
# replaces the deprecated `..count..` notation.
ggplot(net_indeg_log, aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = after_stat(count)),
    show.legend = FALSE
  ) +
  mytheme +
  scale_x_log10() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "In-Degree Distribution Log Transform of similar products",
    x = "Log (Number of Similar Products)",
    y = "Number of Nodes"
  ) +
  scale_fill_gradient(low = "grey", high = "black")

# Comment on what you observe - after the log transform the distribution looks very different; it is not heavily right skewed compared to the previous distribution.

#Compute the average number of inbound co-purchase links, the standard deviation, and the maximum. Comment on the result.

# Mean of the in-degree column, rounded to one decimal place.
paste(
  "The Average number of inbound co-purchase links is:",
  round(mean(net_indeg[["in degree"]]), 1)
)

## [1] "The Average number of inbound co-purchase links is: 3.4"

# Standard deviation of the in-degree column, rounded to one decimal place.
paste(
  "The Standard deviation of inbound co-purchase links is:",
  round(sd(net_indeg[["in degree"]]), 1)
)

## [1] "The Standard deviation of inbound co-purchase links is: 6"

# Largest in-degree of any vertex.
paste(
  "The maximum of inbound co-purchase links is:",
  max(net_indeg[["in degree"]])
)

## [1] "The maximum of inbound co-purchase links is: 549"

#Report the names of the 10 products with the most inbound co-purchase links.

# Read the product names as a table from the .txt file; sep = "" treats any
# run of white space as the field separator.
data_title <- read.csv(file = "id_to_titles.txt", sep = "")

# Rename columns
names(data_title) <- c("product", "name")

# Join the in-degree data frame with the titles; inner_join() keeps only the
# vertices whose id has a matching `product` entry.
# NOTE(review): the join needs `from` and `product` to have compatible
# types - confirm how the ids in id_to_titles.txt are parsed.
df_top10_full <- net_indeg %>%
  inner_join(data_title, by = c("from" = "product"))

# Data frame with the 10 rows of highest in-degree (sorted descending).
df_top10 <- as.data.frame(
  head(
    df_top10_full[order(df_top10_full$`in degree`, decreasing = TRUE), ],
    n = 10
  )
)

# Report the names of the 10 products with the most inbound co-purchase links.
paste(df_top10$name)

##  [1] "Laura"                                                                                                                                  
##  [2] "Diagnostic and Statistical Manual of Mental Disorders DSM-IV-TR (Text Revision) (Diagnostic and Statistical Manual of Mental Disorders)"
##  [3] "Publication Manual of the American Psychological Association, Fifth Edition"                                                            
##  [4] "The Great Gatsby"                                                                                                                       
##  [5] "1001 Most Useful Spanish Words (Beginners' Guides)"                                                                                     
##  [6] "It Works"                                                                                                                               
##  [7] "Brown Bear, Brown Bear, What Do You See?"                                                                                               
##  [8] "Easy Spanish Phrase Book: Over 770 Basic Phrases for Everyday Use"                                                                      
##  [9] "The Prince"                                                                                                                             
## [10] "The TEMPEST"

# Amazon Book Sales Cluster
#
# Sayem Nasher
#
# 10-15-2022