Network Analytics_Amazon Pre-Class (V2)

Network structure visualization

2.1. Plot the network using the information in the file “graph_subset_rank1000.txt”. Note that this is not the complete network, but only a subset of edges between top-ranked products. By visualizing the graph, you get an idea of the structure of the network you will be working on. In addition to plotting, comment on anything interesting you observe.

library(igraph)
library(ggplot2)
library(dplyr)

# Read the edge list for the top-ranked product subset; the two columns are
# named "from" and "to" so igraph can interpret each row as one edge.
data_rank100 <- read.table(file = "graph_subset_rank1000.txt")
names(data_rank100) <- c("from", "to")

# Build an undirected graph; with vertices = NULL the vertex set is taken
# from the ids that appear in the edge list.
df_rank100 <-
  graph_from_data_frame(data_rank100, directed = FALSE, vertices = NULL)

# Draw the network with the Kamada-Kawai layout. layout_with_kk is the
# current igraph name for the deprecated layout.kamada.kawai alias.
plot.igraph(
  df_rank100,
  layout = layout_with_kk,
  vertex.color = "green",
  vertex.label = NA,
  vertex.size = 4,
  edge.curved = .1,
  asp = -1,
  margin = -.03
)
title("Network Plot (Layout: Kamada-Kawai)",
      cex.main = .75,
      col.main = "Black")

2.2. Now, use the file “graph_subset_rank1000_CC.txt” to plot only the largest connected component in the above network. You should be able to reuse your code from above on the new data.

# Read the edge list of the largest connected component; the two columns are
# named "from" and "to" so igraph can interpret each row as one edge.
data_rank100_cc <- read.table(file = "graph_subset_rank1000_CC.txt")
names(data_rank100_cc) <- c("from", "to")

# Build an undirected graph; with vertices = NULL the vertex set is taken
# from the ids that appear in the edge list.
df_rank100_cc <-
  graph_from_data_frame(data_rank100_cc, directed = FALSE, vertices = NULL)

# Draw the component with the Kamada-Kawai layout. layout_with_kk is the
# current igraph name for the deprecated layout.kamada.kawai alias.
# NOTE(review): vertex.color is given the vertex sequence itself, so the fill
# depends on each vertex's index -- confirm this colouring is intended.
plot.igraph(
  df_rank100_cc,
  layout = layout_with_kk,
  edge.width = 2,
  vertex.color = V(df_rank100_cc),
  vertex.label = NA,
  vertex.size = 3,
  vertex.label.cex = .5,
  edge.curved = .1,
  asp = -1,
  margin = -.03
)
title("Largest Connected Component of Top-Ranked Products (With Layout Kamada-Kawai)",
      cex.main = .75,
      col.main = "Black")

Data analysis

# Read the complete edge list and name its two columns "from" and "to".
data_all <- read.table(file = "graph_complete.txt")
names(data_all) <- c("from", "to")

# Keep each endpoint column as its own one-column data frame.
data_all_from <- as.data.frame(data_all$from)
data_all_to <- as.data.frame(data_all$to)

# Calculate Out & In Degrees

# Convert the edge list to a directed graph. Note that this rebinds
# `data_all` from the data frame read above to an igraph object.
data_all <-
  graph_from_data_frame(data_all, directed = TRUE, vertices = NULL)

# Out-degree of every vertex, kept on the graph under `degree.out`
# (`$<-` on an igraph object stores a graph-level attribute).
data_all$degree.out <- degree(data_all, mode = "out")

# In-degree of every vertex, kept on the graph under `degree.in`.
# The mode is spelled in lower case, matching the documented values
# ("all", "out", "in", "total") and the "out" call above.
data_all$degree.in <- degree(data_all, mode = "in")

3.1. Plot the out-degree distribution of our dataset (x-axis number of similar products, y-axis number of nodes). That is, for each product a, count the number of outgoing links to another product page b such that a -> b.

# Turn the out-degree values stored on the graph into a data frame.
data_outdeg <- as.data.frame(data_all$degree.out)

# Prepend the row names as an explicit id column and label both columns
# in a single step.
data_outdeg <- setNames(
  cbind.data.frame(rownames(data_outdeg), data_outdeg),
  c("from", "out degree")
)

# Shared plot theme: black-and-white base with the panel border and grid
# lines removed, grey axis lines, Calibri text and 10-point axis text/titles.
mytheme <- theme_bw() +
  theme(
    # Typeface used for all text elements
    text = element_text(family = "Calibri"),
    # Strip the panel decorations
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    # Axis line, tick labels and titles
    axis.line = element_line(color = "grey"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    axis.text.x = element_text(size = 10, angle = 0),
    axis.text.y = element_text(size = 10, angle = 0)
  )

# Plot the out-degree distribution: one bar per degree value (binwidth = 1),
# with bars shaded from yellow to red by their count.
# The column is mapped by name inside aes() rather than through
# `data_outdeg$...`, so the mapping is resolved against the `data` argument.
ggplot(data_outdeg, aes(x = `out degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = ..count..),
    show.legend = FALSE
  ) +
  mytheme +
  labs(title = "Out-Degree Distribution",
       x = "Number of Similar Products",
       y = "Number of Nodes") +
  scale_fill_gradient(low = "yellow", high = "red")

3.2. Above, you should have found that each product contains a maximum of five outbound links to similar products in the dataset. Now, plot the in-degree distribution of our dataset (x-axis number of similar products, y-axis number of nodes). That is, for each product a, count the number of incoming links from another product page b such that b -> a. You can use the same steps outlined above. Is the distribution different? Comment on what you observe.

# Turn the in-degree values stored on the graph into a data frame.
data_indeg <- as.data.frame(data_all$degree.in)

# Prepend the row names as an explicit id column and label both columns
# in a single step.
data_indeg <- setNames(
  cbind.data.frame(rownames(data_indeg), data_indeg),
  c("from", "in degree")
)

# Plot the in-degree distribution, first restricted to products with at most
# 30 inbound links, then for the full range.

# Restricted view. The rows are filtered explicitly instead of using
# xlim(-1, 30): a scale limit of 30 would censor the bar for degree 30
# (it spans 29.5-30.5) and emit "removed rows" warnings.
# The column is mapped by name inside aes() rather than via `data_indeg$...`.
ggplot(subset(data_indeg, `in degree` <= 30), aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = ..count..),
    show.legend = FALSE
  ) +
  mytheme +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "In-Degree Distribution (similar product limit = 30)",
       x = "Number of Similar Products",
       y = "Number of Nodes") +
  scale_fill_gradient(low = "green", high = "blue")

# Full range of in-degree values.
ggplot(data_indeg, aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = ..count..),
    show.legend = FALSE
  ) +
  mytheme +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "In-Degree Distribution",
       x = "Number of Similar Products",
       y = "Number of Nodes") +
  scale_fill_gradient(low = "green", high = "blue")

3.3. Transform the x-axis of the previous graph to log scale, to get a better understanding of the distribution. Note here that you should have some products with 0 inbound links. This means that using the log of the x-axis will fail since log(0) will not be valid. Due to this, you should replace 0 with 0.1. Comment on what you observe.

# Replace 0 with 0.1 for the log transformation, since log10(0) is not finite.
# Only the in-degree column is modified: a data-frame-wide `df[df == 0]`
# would also match the id column "from" and corrupt a product whose id is 0.
df_indeg_log <- as.data.frame(data_indeg)

df_indeg_log$`in degree`[df_indeg_log$`in degree` == 0] <- 0.1

# Plot the in-degree histogram on a log10 x-axis.
# NOTE(review): with scale_x_log10() the binwidth is measured in log10 units,
# so binwidth = 1 gives one bar per decade -- confirm this is the intended
# resolution.
ggplot(df_indeg_log, aes(x = `in degree`)) +
  geom_histogram(
    binwidth = 1,
    aes(fill = ..count..),
    show.legend = FALSE
  ) +
  mytheme +
  scale_x_log10() +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "In-Degree Distribution | Log (similar products)",
       x = "Log (Number of Similar Products)",
       y = "Number of Nodes") +
  scale_fill_gradient(low = "green", high = "blue")

3.4. Compute the average number of inbound co-purchase links, the standard deviation, and the maximum. Comment on the result.

# Mean in-degree, rounded to one decimal place.
paste(
  "The Average number of inbound co-purchase links is:",
  round(x = mean(data_indeg[["in degree"]]), digits = 1)
)

## [1] "The Average number of inbound co-purchase links is: 3.4"

# Standard deviation of the in-degree, rounded to one decimal place.
paste(
  "The Standard deviation of inbound co-purchase links is:",
  round(x = sd(data_indeg[["in degree"]]), digits = 1)
)

## [1] "The Standard deviation of inbound co-purchase links is: 6"

# Largest in-degree observed.
paste(
  "The maximum of inbound co-purchase links is:",
  max(data_indeg[["in degree"]])
)

## [1] "The maximum of inbound co-purchase links is: 549"

3.5. Report the names of the 10 products with the most inbound co-purchase links.

# Read the product id -> title lookup; sep = "" splits on whitespace.
data_title <- read.csv(file = "id_to_titles.txt", sep = "")

# Rename columns
names(data_title) <- c("product", "name")

# Join the in-degree table with the titles. Both keys are coerced to
# character first: `from` holds ids as text while `product` may be read as
# a number, and inner_join() refuses keys of incompatible types.
df_top10_full <- data_indeg %>%
  mutate(from = as.character(from)) %>%
  inner_join(
    mutate(data_title, product = as.character(product)),
    by = c("from" = "product")
  )

# Keep the 10 rows with the highest in-degree, in descending order.
df_top10 <- head(
  df_top10_full[order(df_top10_full$`in degree`, decreasing = TRUE), ],
  n = 10
)

paste("The name of top 10 products with most inbound co-purchase are : ")

## [1] "The name of top 10 products with most inbound co-purchase are : "

paste(df_top10$name)

##  [1] "Laura"                                                                                                                                  
##  [2] "Diagnostic and Statistical Manual of Mental Disorders DSM-IV-TR (Text Revision) (Diagnostic and Statistical Manual of Mental Disorders)"
##  [3] "Publication Manual of the American Psychological Association, Fifth Edition"                                                            
##  [4] "The Great Gatsby"                                                                                                                       
##  [5] "1001 Most Useful Spanish Words (Beginners' Guides)"                                                                                     
##  [6] "It Works"                                                                                                                               
##  [7] "Brown Bear, Brown Bear, What Do You See?"                                                                                               
##  [8] "Easy Spanish Phrase Book: Over 770 Basic Phrases for Everyday Use"                                                                      
##  [9] "The Prince"                                                                                                                             
## [10] "The TEMPEST"

Network Analytics_Amazon Pre-Class (V2)

Anubhav Rustogi

10/13/2019