1. Data Preperation
2. Network Structure Visualization
- 2.1 Plot the network using the information in the file graph_subset_rank1000.txt
- 2.2 Plot the network using the information in the file graph_subset_rank1000_cc.txt
3. Data analysis

# load the required libraries
library("igraph")
library("readr")
library("poweRlaw")
library("ggplot2")
library("scales")
library("cowplot")
library("psych")
library("dplyr")

1. Data Preperation

# load data files for analysis 
graph_subset_rank1000 <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_subset_rank1000.txt", 
                                    format ="ncol")
graph_subset_rank1000_cc <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_subset_rank1000_cc.txt", 
                                       format ="ncol")
graph_complete <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt", 
                             format ="ncol")

## VERTEX & EDGES
# view vertex
V(graph_subset_rank1000)

## + 1355/1355 vertices, named, from ff230d1:
##    [1] 411653 94292  68951  478494 236897 265343 472765 153184 172503 424919
##   [11] 469074 48638  480433 220748 42974  491768 105110 291610 67371  78848 
##   [21] 120884 390297 212405 355494 325446 29203  349384 20195  444150 285340
##   [31] 5039   246337 21728  530653 265598 216323 354795 45094  96553  342416
##   [41] 49527  445728 363808 267451 30639  287962 20466  265468 88060  401507
##   [51] 353676 443284 33304  95210  124568 17994  374639 48316  535849 367733
##   [61] 83582  30695  468264 84492  225301 572    87559  315493 54800  488883
##   [71] 337679 209067 97692  35109  531651 108364 108187 13198  25341  65731 
##   [81] 117196 62210  178132 144372 458306 324938 214842 197455 334110 61751 
##   [91] 397309 97533  429060 100805 61718  313421 420909 514999 281633 178948
## + ... omitted several vertices

V(graph_subset_rank1000_cc)

## + 292/292 vertices, named, from 28cc0ba:
##   [1] 415305 112822 297722 116802 182411 539734 273044 267500 539367 538785
##  [11] 538546 540230 540153 539964 516279 542414 171675 541064 447339 82836 
##  [21] 48611  73768  274142 95554  73437  75527  426430 312068 128602 135171
##  [31] 241213 381095 12274  541914 198466 423769 1299   136432 529269 262229
##  [41] 540409 111494 52374  539257 539854 419575 340252 485290 185121 111591
##  [51] 201516 63986  150174 420056 77548  440768 190050 55147  92686  510545
##  [61] 313436 462332 11731  287328 433974 233148 330960 345354 454037 398931
##  [71] 96529  463542 178396 273273 541902 135011 113827 372908 538640 188833
##  [81] 139989 541281 541371 150987 338680 370635 51791  540248 243066 539088
##  [91] 265550 480146 116030 539872 178718 327275 239251 542314 70526  111255
## + ... omitted several vertices

V(graph_complete)

## + 366987/366987 vertices, named, from 0967b6d:
##     [1] 140890 11405  204319 1046   137479 170475 183024 95075  231749 405486
##    [11] 353453 272171 295744 293925 39760  447055 55313  364778 176964 454565
##    [21] 306149 284831 145536 203692 286635 82947  361694 495330 265893 351683
##    [31] 372405 33565  153741 381077 330917 526683 99307  416175 150565 195300
##    [41] 246253 396051 203463 47753  214354 478549 514761 27411  368005 95268 
##    [51] 371129 30450  21938  497078 110895 164533 133374 65027  391841 212653
##    [61] 159183 169131 303923 230888 356707 425231 18413  47628  106827 275827
##    [71] 488460 532140 53077  250126 11255  19748  89176  301046 360323 172976
##    [81] 46980  366961 446090 212522 522811 469602 375116 158006 295228 37570 
##    [91] 61404  82219  290610 310069 415430 61062  169245 263763 3325   240262
## + ... omitted several vertices

# vertex size
gorder(graph_subset_rank1000)

## [1] 1355

gorder(graph_subset_rank1000_cc)

## [1] 292

gorder(graph_complete)

## [1] 366987

# view edges
E(graph_subset_rank1000)

## + 2611/2611 edges from ff230d1 (vertex names):
##  [1] 411653--94292  68951 --478494 236897--265343 236897--265343 236897--472765
##  [6] 153184--172503 172503--424919 469074--48638  48638 --480433 220748--42974 
## [11] 220748--42974  491768--105110 105110--291610 491768--105110 67371 --78848 
## [16] 120884--390297 390297--212405 355494--325446 29203 --349384 349384--20195 
## [21] 349384--444150 349384--285340 5039  --246337 325446--21728  21728 --530653
## [26] 21728 --265598 355494--21728  216323--354795 45094 --96553  45094 --96553 
## [31] 342416--49527  445728--363808 267451--30639  287962--20466  153184--172503
## [36] 153184--265468 153184--88060  153184--401507 153184--424919 153184--353676
## [41] 153184--443284 153184--33304  153184--95210  124568--17994  17994 --374639
## [46] 17994 --48316  17994 --535849 17994 --367733 124568--367733 83582 --30695 
## + ... omitted several edges

E(graph_subset_rank1000_cc)

## + 604/604 edges from 28cc0ba (vertex names):
##  [1] 415305--112822 297722--116802 116802--182411 539734--273044 273044--267500
##  [6] 539367--538785 538785--538546 540230--540153 540153--539964 540153--516279
## [11] 540230--540153 540230--542414 540230--539964 171675--541064 539367--538785
## [16] 415305--539367 539367--447339 539367--82836  539367--48611  73768 --274142
## [21] 95554 --73437  540153--542414 540230--542414 542414--73437  539964--542414
## [26] 542414--95554  75527 --426430 312068--128602 128602--135171 128602--241213
## [31] 381095--12274  12274 --541914 12274 --198466 12274 --423769 12274 --1299  
## [36] 12274 --136432 12274 --529269 12274 --262229 312068--135171 128602--135171
## [41] 135171--241213 273044--540409 540409--111494 540409--52374  540409--539257
## [46] 540409--539854 540409--419575 540409--340252 485290--185121 111591--201516
## + ... omitted several edges

E(graph_complete)

## + 1231400/1231400 edges from 0967b6d (vertex names):
##  [1] 140890--11405  11405 --204319 1046  --137479 137479--170475 183024--95075 
##  [6] 95075 --231749 405486--353453 272171--295744 295744--293925 295744--39760 
## [11] 295744--447055 295744--55313  295744--364778 295744--176964 295744--454565
## [16] 295744--306149 295744--284831 272171--295744 272171--293925 272171--447055
## [21] 272171--176964 272171--454565 272171--306149 39760 --145536 55313 --145536
## [26] 284831--145536 295744--293925 272171--293925 293925--145536 293925--39760 
## [31] 293925--447055 293925--55313  293925--364778 293925--454565 293925--203692
## [36] 293925--284831 293925--286635 293925--82947  39760 --145536 39760 --55313 
## [41] 39760 --284831 361694--495330 495330--265893 272171--447055 447055--176964
## [46] 351683--372405 372405--33565  372405--153741 372405--381077 372405--330917
## + ... omitted several edges

# edge size
gsize(graph_subset_rank1000)

## [1] 2611

gsize(graph_subset_rank1000_cc)

## [1] 604

gsize(graph_complete)

## [1] 1231400

# Is the graph directed?
is.directed(graph_subset_rank1000)

## [1] FALSE

is.directed(graph_subset_rank1000_cc)

## [1] FALSE

is.directed(graph_complete)

## [1] FALSE

2. Network Structure Visualization

2.1 Plot the network using the information in the file graph_subset_rank1000.txt

graph_subset_rank1000_nicely = layout_nicely(graph_subset_rank1000)
set.seed(100)
plot(graph_subset_rank1000, 
     
     # === vertex
     vertex.color = rgb(0.8,0.4,0.3,0.8), # Node color
     vertex.frame.color = "black",        # Node border color
     vertex.shape="circle",               # One of “none”, “circle”, 
                                          # “square”, “csquare”, “rectangle” 
                                          # “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
     vertex.size=5,                       # Size of the node (default is 15)
     vertex.size2=NA,                     # The second size of the node (e.g. for a rectangle)
     
     # === vertex label
     vertex.label=LETTERS[1:10],          # Character vector used to label the nodes
     vertex.label.color="white",
     vertex.label.family="Times",         # Font family of the label (e.g.“Times”, “Helvetica”)
     vertex.label.font=1,                 # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
     vertex.label.cex=0,                  # Font size (multiplication factor, device-dependent)
     vertex.label.dist=0,                 # Distance between the label and the vertex
     vertex.label.degree=0,               # The position of the label in relation to the vertex (use pi)
     
     # === Edge
     edge.color="black",                  # Edge color
     edge.width=1,                        # Edge width, defaults to 1
     edge.arrow.size=1,                   # Arrow size, defaults to 1
     edge.arrow.width=1,                  # Arrow width, defaults to 1
     edge.lty=3,                          # Line type, could be 0 or “blank”, 
                                          # 1 or “solid”, 2 or “dashed”, 3 or “dotted”, 
                                          # 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
     edge.curved=0.3,                     # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
     
     # === Layout
     #layout = graph_subset_rank1000_nicely
     layout = layout.kamada.kawai(graph_subset_rank1000)
)

Summary:

The plot of graph_subset_rank1000.txt shows distinct clusters of products with similar page rank and its co purchases. Looking at the graph the cluster observed at center seemed to share similar page rank compared to outer clusters

2.2 Plot the network using the information in the file graph_subset_rank1000_cc.txt

graph_subset_rank1000_cc_nicely = layout_nicely(graph_subset_rank1000_cc)
set.seed(100)
plot(graph_subset_rank1000_cc, 
     
     # === vertex
     vertex.color = rgb(0.8,0.4,0.3,0.8),   # Node color
     vertex.frame.color = "black",          # Node border color
     vertex.shape="circle",                 # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”,                                                      “raster”, or “sphere”
     vertex.size=5,                         # Size of the node (default is 15)
     vertex.size2=NA,                       # The second size of the node (e.g. for a rectangle)
     
     # === vertex label
     vertex.label=LETTERS[1:10],            # Character vector used to label the nodes
     vertex.label.color="white",
     vertex.label.family="Times",           # Font family of the label (e.g.“Times”, “Helvetica”)
     vertex.label.font=1,                   # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
     vertex.label.cex=0,                    # Font size (multiplication factor, device-dependent)
     vertex.label.dist=0,                   # Distance between the label and the vertex
     vertex.label.degree=0,                 # The position of the label in relation to the vertex (use pi)
     
     # === Edge
     edge.color="black",                    # Edge color
     edge.width=1,                          # Edge width, defaults to 1
     edge.arrow.size=1,                     # Arrow size, defaults to 1
     edge.arrow.width=1,                    # Arrow width, defaults to 1
     edge.lty=3,                            # Line type, could be 0 or “blank”, 1 or “solid”, 
                                            # 2 or “dashed”, 3 or “dotted”, 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
     edge.curved=0.3,                       # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
     
     # === Layout
     #layout = graph_subset_rank1000_cc_nicely
     layout = layout.kamada.kawai(graph_subset_rank1000_cc)
)

Summary:

The plot for graph_subset_rank1000_cc.txt hints the products which are connected through their frequent purchase and having a page rank under 1000. The density of the graph is much more easily observable since it highlights mainly on the connectivity amongst the products

3. Data analysis

3.1 Out-Degree Distribution

graph_complete_table = read.table(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt", 
                                  header = FALSE)
graph_complete_df <- graph.data.frame(graph_complete_table)

# List of out degree
graph_complete_out_deg <- degree(graph_complete_df, mode = "out")

# Let's count the frequencies of each degree
graph_complete_out_deg_hist <- as.data.frame(table(graph_complete_out_deg))

# convert the first column to numbers
graph_complete_out_deg_hist[,1] <- as.numeric(paste(graph_complete_out_deg_hist[,1]))

# plot out degree distribution
ggplot(graph_complete_out_deg_hist, aes(x = graph_complete_out_deg, y = Freq)) +
        geom_point() +
        scale_x_continuous("Out-Degree\n(nodes with a->b amount of connections)") +
        scale_y_continuous("Frequency\n(how many of them)") +
        ggtitle("Out-Degree Distribution") +
        theme_light()

3.2 In-Degree Distribution

# List of in degree
graph_complete_in_deg <- degree(graph_complete_df, mode = "in")

# Let's count the frequencies of each degree
graph_complete_in_deg_hist <- as.data.frame(table(graph_complete_in_deg))

# convert the first column to numbers
graph_complete_in_deg_hist[,1] <- as.numeric(paste(graph_complete_in_deg_hist[,1]))

# plot in-degree distribution
ggplot(graph_complete_in_deg_hist, aes(x = graph_complete_in_deg, y = Freq)) +
        geom_point() +
        scale_x_continuous("In-Degree\n(nodes with b->a amount of connections)") +
        scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
        ggtitle("In-Degree Distribution") +
        theme_light()

Summary: The in-distribution is apparently different from out-distribution. Each product contains a maximum of 570 in-bound links compared to 5 out-bound links. The graph tells us as the number of b->a links increases the frequency of such products having such link decreases. This also tells us that there the frequency of a->b links are higher than b->a links in the dataset

3.3 Plot log transformation of x-axis of in-bound distribution

# plot log transformation of x-axis of inbound distribution
# update products with zero inbound links to 0.1 to deal with log transformation
graph_complete_in_deg_hist_log = graph_complete_in_deg_hist
graph_complete_in_deg_hist_log$graph_complete_in_deg = 
        ifelse(graph_complete_in_deg_hist_log$graph_complete_in_deg==0,0.1,
               graph_complete_in_deg_hist_log$graph_complete_in_deg)

# plot
ggplot(graph_complete_in_deg_hist_log, aes(x = graph_complete_in_deg, y = Freq)) +
        geom_point() +
        scale_x_continuous("In-Degree\n(nodes with b->a amount of connections)", trans = "log10") +
        scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
        ggtitle("In-Degree Distribution") +
        theme_light()

# show both plots side by side
outbound_plot1 = ggplot(graph_complete_out_deg_hist, aes(x = graph_complete_out_deg, y = Freq)) +
        geom_point() +
        scale_x_continuous("Out-Degree\n(nodes with a->b amount of connections)") +
        scale_y_continuous("Frequency\n(how many of them)") +
        ggtitle("Out-Degree Distribution") +
        theme_light()
inbound_plot2 = ggplot(graph_complete_in_deg_hist_log, aes(x = graph_complete_in_deg, y = Freq)) +
        geom_point() +
        scale_x_continuous("In-Degree\n(nodes with b->a log10 amount of connections)", trans = "log10") +
        scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
        ggtitle("In-Degree log Distribution") +
        theme_light()
plot_grid(outbound_plot1,inbound_plot2)

Summary:

The dataset has high volume products with of low inbound links and low frequency of products with more than 10 (log transformed) inbound links. The frequency almost flattens to zero as the number of inbound links in the dataset increases more than around 10 (log transformed) units.

3.4 Compute mean, standard deviation and upper limits for in-bound links

cat("The average number of inbound co-purchase links:",
    round(mean(graph_complete_in_deg_hist$graph_complete_in_deg),2))

## The average number of inbound co-purchase links: 88.28

cat("The standard deviation of inbound co-purchase links:", 
    round(sd(graph_complete_in_deg_hist$graph_complete_in_deg),2))

## The standard deviation of inbound co-purchase links: 70.38

cat("The maximum value of inbound co-purchase links:", 
    max(graph_complete_in_deg_hist$graph_complete_in_deg))

## The maximum value of inbound co-purchase links: 549

3.5 Report the names of the 10 products with the most inbound co-purchase links

# load data
amazon_grapgh_complete <- read.table(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt")

# rename table headers
names(amazon_grapgh_complete)<- c("from", "to")

# calculate total nodes
a1 <- amazon_grapgh_complete %>%
        distinct(from) %>%
        rename(label = "from")

b1 <- amazon_grapgh_complete %>%
        distinct(to) %>%
        rename(label = "to")

amazon_nodes_all <- full_join(a1, b1, by = "label")
amazon_nodes_all <- as.data.frame(amazon_nodes_all)

# Create In link weights for each from-to combination
amazon_in_deg <- amazon_grapgh_complete %>%
        group_by(to, from) %>%
        summarise(weight = n()) %>%
        ungroup()

## `summarise()` has grouped output by 'to'. You can override using the `.groups` argument.

# Sum the weight count to calculate total In degree
amazon_in_deg <-
        aggregate(amazon_in_deg$weight,
                  by = list(product = amazon_in_deg$to),
                  FUN = sum)

# combine in-degree with nodes list
amazon_in_deg_df <- amazon_in_deg %>%
        right_join(amazon_nodes_all, by = c("product" = "label"))

amazon_in_deg_df <- as.data.frame(amazon_in_deg_df)

# convert NA's to 0
amazon_in_deg_df[is.na(amazon_in_deg_df)] <- 0

# read product names as a table from.txt file 
amazon_titles <- read.table(file = "~/Desktop/M3 - Network Analytics/Data/id_to_titles.txt", fill = TRUE)

# loose first row to delete copied names
amazon_titles <- amazon_titles[-1, ]

# rename columns 
names(amazon_titles) = c("product", "name")

#convert product number from factor to integer for joining 
amazon_titles$product <- as.integer(amazon_titles$product)

# join in-degree data frame with Title 
amazon_top10_products_list <- amazon_in_deg_df %>%
        inner_join(amazon_titles, by = c("product" = "product"))

# Create Data frame with top 10 
amazon_top10_products_df <-
        as.data.frame(head(amazon_top10_products_list[order(amazon_top10_products_list$x, decreasing = T),],n=10))

print("The name of top 10 products with most inbound co-purchase are : ")

## [1] "The name of top 10 products with most inbound co-purchase are : "

print(amazon_top10_products_df$name)

##  [1] "Laura"                                                                                                                                  
##  [2] "Diagnostic and Statistical Manual of Mental Disorders DSM-IV-TR (Text Revision) (Diagnostic and Statistical Manual of Mental Disorders)"
##  [3] "Publication Manual of the American Psychological Association, Fifth Edition"                                                            
##  [4] "The Great Gatsby"                                                                                                                       
##  [5] "1001 Most Useful Spanish Words (Beginners' Guides)"                                                                                     
##  [6] "It Works"                                                                                                                               
##  [7] "Brown Bear, Brown Bear, What Do You See?"                                                                                               
##  [8] "Easy Spanish Phrase Book: Over 770 Basic Phrases for Everyday Use"                                                                      
##  [9] "The Prince"                                                                                                                             
## [10] "The TEMPEST"

Pre-Module Network Data Analysis Assignment

Nag Rajendran

10/25/2021

1. Data Preperation

2. Network Structure Visualization

2.1 Plot the network using the information in the file graph_subset_rank1000.txt

2.2 Plot the network using the information in the file graph_subset_rank1000_cc.txt

3. Data analysis

3.1 Out-Degree Distribution

3.2 In-Degree Distribution

3.3 Plot log transformation of x-axis of in-bound distribution

3.4 Compute mean, standard deviation and upper limits for in-bound links

3.5 Report the names of the 10 products with the most inbound co-purchase links