# load the required libraries
library("igraph")
library("readr")
library("poweRlaw")
library("ggplot2")
library("scales")
library("cowplot")
library("psych")
library("dplyr")
# load data files for analysis
graph_subset_rank1000 <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_subset_rank1000.txt",
format ="ncol")
graph_subset_rank1000_cc <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_subset_rank1000_cc.txt",
format ="ncol")
graph_complete <- read.graph(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt",
format ="ncol")
## VERTEX & EDGES
# view vertex
V(graph_subset_rank1000)
## + 1355/1355 vertices, named, from ff230d1:
## [1] 411653 94292 68951 478494 236897 265343 472765 153184 172503 424919
## [11] 469074 48638 480433 220748 42974 491768 105110 291610 67371 78848
## [21] 120884 390297 212405 355494 325446 29203 349384 20195 444150 285340
## [31] 5039 246337 21728 530653 265598 216323 354795 45094 96553 342416
## [41] 49527 445728 363808 267451 30639 287962 20466 265468 88060 401507
## [51] 353676 443284 33304 95210 124568 17994 374639 48316 535849 367733
## [61] 83582 30695 468264 84492 225301 572 87559 315493 54800 488883
## [71] 337679 209067 97692 35109 531651 108364 108187 13198 25341 65731
## [81] 117196 62210 178132 144372 458306 324938 214842 197455 334110 61751
## [91] 397309 97533 429060 100805 61718 313421 420909 514999 281633 178948
## + ... omitted several vertices
V(graph_subset_rank1000_cc)
## + 292/292 vertices, named, from 28cc0ba:
## [1] 415305 112822 297722 116802 182411 539734 273044 267500 539367 538785
## [11] 538546 540230 540153 539964 516279 542414 171675 541064 447339 82836
## [21] 48611 73768 274142 95554 73437 75527 426430 312068 128602 135171
## [31] 241213 381095 12274 541914 198466 423769 1299 136432 529269 262229
## [41] 540409 111494 52374 539257 539854 419575 340252 485290 185121 111591
## [51] 201516 63986 150174 420056 77548 440768 190050 55147 92686 510545
## [61] 313436 462332 11731 287328 433974 233148 330960 345354 454037 398931
## [71] 96529 463542 178396 273273 541902 135011 113827 372908 538640 188833
## [81] 139989 541281 541371 150987 338680 370635 51791 540248 243066 539088
## [91] 265550 480146 116030 539872 178718 327275 239251 542314 70526 111255
## + ... omitted several vertices
V(graph_complete)
## + 366987/366987 vertices, named, from 0967b6d:
## [1] 140890 11405 204319 1046 137479 170475 183024 95075 231749 405486
## [11] 353453 272171 295744 293925 39760 447055 55313 364778 176964 454565
## [21] 306149 284831 145536 203692 286635 82947 361694 495330 265893 351683
## [31] 372405 33565 153741 381077 330917 526683 99307 416175 150565 195300
## [41] 246253 396051 203463 47753 214354 478549 514761 27411 368005 95268
## [51] 371129 30450 21938 497078 110895 164533 133374 65027 391841 212653
## [61] 159183 169131 303923 230888 356707 425231 18413 47628 106827 275827
## [71] 488460 532140 53077 250126 11255 19748 89176 301046 360323 172976
## [81] 46980 366961 446090 212522 522811 469602 375116 158006 295228 37570
## [91] 61404 82219 290610 310069 415430 61062 169245 263763 3325 240262
## + ... omitted several vertices
# vertex size
gorder(graph_subset_rank1000)
## [1] 1355
gorder(graph_subset_rank1000_cc)
## [1] 292
gorder(graph_complete)
## [1] 366987
# view edges
E(graph_subset_rank1000)
## + 2611/2611 edges from ff230d1 (vertex names):
## [1] 411653--94292 68951 --478494 236897--265343 236897--265343 236897--472765
## [6] 153184--172503 172503--424919 469074--48638 48638 --480433 220748--42974
## [11] 220748--42974 491768--105110 105110--291610 491768--105110 67371 --78848
## [16] 120884--390297 390297--212405 355494--325446 29203 --349384 349384--20195
## [21] 349384--444150 349384--285340 5039 --246337 325446--21728 21728 --530653
## [26] 21728 --265598 355494--21728 216323--354795 45094 --96553 45094 --96553
## [31] 342416--49527 445728--363808 267451--30639 287962--20466 153184--172503
## [36] 153184--265468 153184--88060 153184--401507 153184--424919 153184--353676
## [41] 153184--443284 153184--33304 153184--95210 124568--17994 17994 --374639
## [46] 17994 --48316 17994 --535849 17994 --367733 124568--367733 83582 --30695
## + ... omitted several edges
E(graph_subset_rank1000_cc)
## + 604/604 edges from 28cc0ba (vertex names):
## [1] 415305--112822 297722--116802 116802--182411 539734--273044 273044--267500
## [6] 539367--538785 538785--538546 540230--540153 540153--539964 540153--516279
## [11] 540230--540153 540230--542414 540230--539964 171675--541064 539367--538785
## [16] 415305--539367 539367--447339 539367--82836 539367--48611 73768 --274142
## [21] 95554 --73437 540153--542414 540230--542414 542414--73437 539964--542414
## [26] 542414--95554 75527 --426430 312068--128602 128602--135171 128602--241213
## [31] 381095--12274 12274 --541914 12274 --198466 12274 --423769 12274 --1299
## [36] 12274 --136432 12274 --529269 12274 --262229 312068--135171 128602--135171
## [41] 135171--241213 273044--540409 540409--111494 540409--52374 540409--539257
## [46] 540409--539854 540409--419575 540409--340252 485290--185121 111591--201516
## + ... omitted several edges
E(graph_complete)
## + 1231400/1231400 edges from 0967b6d (vertex names):
## [1] 140890--11405 11405 --204319 1046 --137479 137479--170475 183024--95075
## [6] 95075 --231749 405486--353453 272171--295744 295744--293925 295744--39760
## [11] 295744--447055 295744--55313 295744--364778 295744--176964 295744--454565
## [16] 295744--306149 295744--284831 272171--295744 272171--293925 272171--447055
## [21] 272171--176964 272171--454565 272171--306149 39760 --145536 55313 --145536
## [26] 284831--145536 295744--293925 272171--293925 293925--145536 293925--39760
## [31] 293925--447055 293925--55313 293925--364778 293925--454565 293925--203692
## [36] 293925--284831 293925--286635 293925--82947 39760 --145536 39760 --55313
## [41] 39760 --284831 361694--495330 495330--265893 272171--447055 447055--176964
## [46] 351683--372405 372405--33565 372405--153741 372405--381077 372405--330917
## + ... omitted several edges
# edge size
gsize(graph_subset_rank1000)
## [1] 2611
gsize(graph_subset_rank1000_cc)
## [1] 604
gsize(graph_complete)
## [1] 1231400
# Is the graph directed?
is.directed(graph_subset_rank1000)
## [1] FALSE
is.directed(graph_subset_rank1000_cc)
## [1] FALSE
is.directed(graph_complete)
## [1] FALSE
graph_subset_rank1000_nicely = layout_nicely(graph_subset_rank1000)
set.seed(100)
plot(graph_subset_rank1000,
# === vertex
vertex.color = rgb(0.8,0.4,0.3,0.8), # Node color
vertex.frame.color = "black", # Node border color
vertex.shape="circle", # One of “none”, “circle”,
# “square”, “csquare”, “rectangle”
# “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
vertex.size=5, # Size of the node (default is 15)
vertex.size2=NA, # The second size of the node (e.g. for a rectangle)
# === vertex label
vertex.label=LETTERS[1:10], # Character vector used to label the nodes
vertex.label.color="white",
vertex.label.family="Times", # Font family of the label (e.g.“Times”, “Helvetica”)
vertex.label.font=1, # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
vertex.label.cex=0, # Font size (multiplication factor, device-dependent)
vertex.label.dist=0, # Distance between the label and the vertex
vertex.label.degree=0, # The position of the label in relation to the vertex (use pi)
# === Edge
edge.color="black", # Edge color
edge.width=1, # Edge width, defaults to 1
edge.arrow.size=1, # Arrow size, defaults to 1
edge.arrow.width=1, # Arrow width, defaults to 1
edge.lty=3, # Line type, could be 0 or “blank”,
# 1 or “solid”, 2 or “dashed”, 3 or “dotted”,
# 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
edge.curved=0.3, # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
# === Layout
#layout = graph_subset_rank1000_nicely
layout = layout.kamada.kawai(graph_subset_rank1000)
)
Summary:
The plot of graph_subset_rank1000.txt shows distinct clusters of products with similar page rank and its co purchases. Looking at the graph the cluster observed at center seemed to share similar page rank compared to outer clusters
graph_subset_rank1000_cc_nicely = layout_nicely(graph_subset_rank1000_cc)
set.seed(100)
plot(graph_subset_rank1000_cc,
# === vertex
vertex.color = rgb(0.8,0.4,0.3,0.8), # Node color
vertex.frame.color = "black", # Node border color
vertex.shape="circle", # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
vertex.size=5, # Size of the node (default is 15)
vertex.size2=NA, # The second size of the node (e.g. for a rectangle)
# === vertex label
vertex.label=LETTERS[1:10], # Character vector used to label the nodes
vertex.label.color="white",
vertex.label.family="Times", # Font family of the label (e.g.“Times”, “Helvetica”)
vertex.label.font=1, # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
vertex.label.cex=0, # Font size (multiplication factor, device-dependent)
vertex.label.dist=0, # Distance between the label and the vertex
vertex.label.degree=0, # The position of the label in relation to the vertex (use pi)
# === Edge
edge.color="black", # Edge color
edge.width=1, # Edge width, defaults to 1
edge.arrow.size=1, # Arrow size, defaults to 1
edge.arrow.width=1, # Arrow width, defaults to 1
edge.lty=3, # Line type, could be 0 or “blank”, 1 or “solid”,
# 2 or “dashed”, 3 or “dotted”, 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
edge.curved=0.3, # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
# === Layout
#layout = graph_subset_rank1000_cc_nicely
layout = layout.kamada.kawai(graph_subset_rank1000_cc)
)
Summary:
The plot for graph_subset_rank1000_cc.txt hints the products which are connected through their frequent purchase and having a page rank under 1000. The density of the graph is much more easily observable since it highlights mainly on the connectivity amongst the products
graph_complete_table = read.table(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt",
header = FALSE)
graph_complete_df <- graph.data.frame(graph_complete_table)
# List of out degree
graph_complete_out_deg <- degree(graph_complete_df, mode = "out")
# Let's count the frequencies of each degree
graph_complete_out_deg_hist <- as.data.frame(table(graph_complete_out_deg))
# convert the first column to numbers
graph_complete_out_deg_hist[,1] <- as.numeric(paste(graph_complete_out_deg_hist[,1]))
# plot out degree distribution
ggplot(graph_complete_out_deg_hist, aes(x = graph_complete_out_deg, y = Freq)) +
geom_point() +
scale_x_continuous("Out-Degree\n(nodes with a->b amount of connections)") +
scale_y_continuous("Frequency\n(how many of them)") +
ggtitle("Out-Degree Distribution") +
theme_light()
# List of in degree
graph_complete_in_deg <- degree(graph_complete_df, mode = "in")
# Let's count the frequencies of each degree
graph_complete_in_deg_hist <- as.data.frame(table(graph_complete_in_deg))
# convert the first column to numbers
graph_complete_in_deg_hist[,1] <- as.numeric(paste(graph_complete_in_deg_hist[,1]))
# plot in-degree distribution
ggplot(graph_complete_in_deg_hist, aes(x = graph_complete_in_deg, y = Freq)) +
geom_point() +
scale_x_continuous("In-Degree\n(nodes with b->a amount of connections)") +
scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
ggtitle("In-Degree Distribution") +
theme_light()
Summary: The in-distribution is apparently different from out-distribution. Each product contains a maximum of 570 in-bound links compared to 5 out-bound links. The graph tells us as the number of b->a links increases the frequency of such products having such link decreases. This also tells us that there the frequency of a->b links are higher than b->a links in the dataset
# plot log transformation of x-axis of inbound distribution
# update products with zero inbound links to 0.1 to deal with log transformation
graph_complete_in_deg_hist_log = graph_complete_in_deg_hist
graph_complete_in_deg_hist_log$graph_complete_in_deg =
ifelse(graph_complete_in_deg_hist_log$graph_complete_in_deg==0,0.1,
graph_complete_in_deg_hist_log$graph_complete_in_deg)
# plot
ggplot(graph_complete_in_deg_hist_log, aes(x = graph_complete_in_deg, y = Freq)) +
geom_point() +
scale_x_continuous("In-Degree\n(nodes with b->a amount of connections)", trans = "log10") +
scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
ggtitle("In-Degree Distribution") +
theme_light()
# show both plots side by side
outbound_plot1 = ggplot(graph_complete_out_deg_hist, aes(x = graph_complete_out_deg, y = Freq)) +
geom_point() +
scale_x_continuous("Out-Degree\n(nodes with a->b amount of connections)") +
scale_y_continuous("Frequency\n(how many of them)") +
ggtitle("Out-Degree Distribution") +
theme_light()
inbound_plot2 = ggplot(graph_complete_in_deg_hist_log, aes(x = graph_complete_in_deg, y = Freq)) +
geom_point() +
scale_x_continuous("In-Degree\n(nodes with b->a log10 amount of connections)", trans = "log10") +
scale_y_continuous("Frequency\n(how many of them)", labels = comma) +
ggtitle("In-Degree log Distribution") +
theme_light()
plot_grid(outbound_plot1,inbound_plot2)
Summary:
The dataset has high volume products with of low inbound links and low frequency of products with more than 10 (log transformed) inbound links. The frequency almost flattens to zero as the number of inbound links in the dataset increases more than around 10 (log transformed) units.
cat("The average number of inbound co-purchase links:",
round(mean(graph_complete_in_deg_hist$graph_complete_in_deg),2))
## The average number of inbound co-purchase links: 88.28
cat("The standard deviation of inbound co-purchase links:",
round(sd(graph_complete_in_deg_hist$graph_complete_in_deg),2))
## The standard deviation of inbound co-purchase links: 70.38
cat("The maximum value of inbound co-purchase links:",
max(graph_complete_in_deg_hist$graph_complete_in_deg))
## The maximum value of inbound co-purchase links: 549
# load data
amazon_grapgh_complete <- read.table(file="~/Desktop/M3 - Network Analytics/Data/graph_complete.txt")
# rename table headers
names(amazon_grapgh_complete)<- c("from", "to")
# calculate total nodes
a1 <- amazon_grapgh_complete %>%
distinct(from) %>%
rename(label = "from")
b1 <- amazon_grapgh_complete %>%
distinct(to) %>%
rename(label = "to")
amazon_nodes_all <- full_join(a1, b1, by = "label")
amazon_nodes_all <- as.data.frame(amazon_nodes_all)
# Create In link weights for each from-to combination
amazon_in_deg <- amazon_grapgh_complete %>%
group_by(to, from) %>%
summarise(weight = n()) %>%
ungroup()
## `summarise()` has grouped output by 'to'. You can override using the `.groups` argument.
# Sum the weight count to calculate total In degree
amazon_in_deg <-
aggregate(amazon_in_deg$weight,
by = list(product = amazon_in_deg$to),
FUN = sum)
# combine in-degree with nodes list
amazon_in_deg_df <- amazon_in_deg %>%
right_join(amazon_nodes_all, by = c("product" = "label"))
amazon_in_deg_df <- as.data.frame(amazon_in_deg_df)
# convert NA's to 0
amazon_in_deg_df[is.na(amazon_in_deg_df)] <- 0
# read product names as a table from.txt file
amazon_titles <- read.table(file = "~/Desktop/M3 - Network Analytics/Data/id_to_titles.txt", fill = TRUE)
# loose first row to delete copied names
amazon_titles <- amazon_titles[-1, ]
# rename columns
names(amazon_titles) = c("product", "name")
#convert product number from factor to integer for joining
amazon_titles$product <- as.integer(amazon_titles$product)
# join in-degree data frame with Title
amazon_top10_products_list <- amazon_in_deg_df %>%
inner_join(amazon_titles, by = c("product" = "product"))
# Create Data frame with top 10
amazon_top10_products_df <-
as.data.frame(head(amazon_top10_products_list[order(amazon_top10_products_list$x, decreasing = T),],n=10))
print("The name of top 10 products with most inbound co-purchase are : ")
## [1] "The name of top 10 products with most inbound co-purchase are : "
print(amazon_top10_products_df$name)
## [1] "Laura"
## [2] "Diagnostic and Statistical Manual of Mental Disorders DSM-IV-TR (Text Revision) (Diagnostic and Statistical Manual of Mental Disorders)"
## [3] "Publication Manual of the American Psychological Association, Fifth Edition"
## [4] "The Great Gatsby"
## [5] "1001 Most Useful Spanish Words (Beginners' Guides)"
## [6] "It Works"
## [7] "Brown Bear, Brown Bear, What Do You See?"
## [8] "Easy Spanish Phrase Book: Over 770 Basic Phrases for Everyday Use"
## [9] "The Prince"
## [10] "The TEMPEST"