Part 1: Extract all articles from the main page of the TechCrunch web site. Create a data frame of these articles.
library(rvest)
## Loading required package: xml2
library(magrittr)
url <- read_html("https://techcrunch.com/")
# Scrape the page for article titles, abstracts, and links
title <- url %>%
  html_nodes(".post-block__title") %>%
  html_text() %>%
  as.character()
abstract <- url %>%
  html_nodes(".post-block__content") %>%
  html_text() %>%
  as.character()
title_link <- url %>%
  html_nodes(".post-block__title__link") %>%
  html_attr("href") %>%
  as.character()
#article=data.frame()
# helper to fetch the full body text of a single article page
# (unused here; the commented lapply below would issue one HTTP request per article)
linktext <- function(x){
  url_title <- read_html(x)
  content <- url_title %>% html_nodes('.text') %>% html_text() %>% as.character()
  unlist(content)
}
#bag<-lapply(title_link, FUN=linktext)
bag <- unlist(abstract)
bag <- gsub("[\"(){}<>\r\n\t]", " ", bag)  # strip quotes, brackets, and control characters
bag <- gsub("\\s+", " ", bag)              # collapse runs of whitespace
bag <- gsub("… Read More", "", bag)        # drop TechCrunch's teaser suffix
article <- data.frame(matrix(bag, ncol = 1, byrow = T))
names(article) <- c("content")
#bag
#url_title<- html(title_link[1])
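For reference, the scraped titles and links can be joined with the cleaned abstracts into one data frame. A minimal sketch, assuming the three vectors have equal length (true when every post block on the page carries a title, a link, and an abstract):
# Sketch: one row per article, combining all three scraped fields
article_full <- data.frame(title = title, link = title_link, content = bag,
                           stringsAsFactors = FALSE)
head(article_full, 3)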
Part 2: Create a corpus of these articles, and then clean it of numbers, punctuation, and stopwords; stem the documents as well.
library(tm)
## Loading required package: NLP
text_corpus = Corpus(VectorSource(article$content))
#inspect(text_corpus)
#print(as.character(text_corpus[[1]]))
text_corpus_clean = tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus_clean = tm_map(text_corpus_clean, removePunctuation)
text_corpus_clean = tm_map(text_corpus_clean, removeNumbers)
text_corpus_clean = tm_map(text_corpus_clean, stemDocument)
print(as.character(text_corpus_clean[[4]]))
## Warning in as.POSIXlt.POSIXct(Sys.time(), tz = "GMT"): unknown timezone
## 'default/America/Los_Angeles'
## [1] "Coinbas take look new cryptocurr add exchang The list kind preannounc startup say s explor ad asset"
Part 3: Create a term document matrix from the corpus and print the top 50 lines. What is the dimension of the TDM?
tdm = TermDocumentMatrix(text_corpus_clean,control=list(minWordLength=1))
dim(tdm)
## [1] 305 20
tdm_matrix=as.matrix(tdm)
tdm_matrix[1:10,]
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## basic 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## charac 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## disney 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## hair 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## leader 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## like 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## main 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## make 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0
## movi 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## nich 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
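Rather than scanning the full 305 x 20 matrix, tm's findFreqTerms() lists the terms whose total frequency reaches a threshold. A quick sketch (the threshold of 3 is arbitrary):
findFreqTerms(tdm, lowfreq = 3)  # terms appearing at least 3 times in the corpus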
Part 4: From the TDM, make a network adjacency matrix of words. Assume two words are linked once if they appear in the same document. If they co-occur in, say, three documents, then they are connected with strength 3. Based on co-occurrence within documents, create the adjacency matrix of words, where the words are nodes and their co-occurrences provide the data for the links.
tdm_matrix[tdm_matrix >= 1] <- 1  # binarize the TDM: 1 if a term appears in a document at all
# term adjacency matrix: entry (i,j) counts the documents in which terms i and j co-occur
term_matrix <- tdm_matrix %*% t(tdm_matrix)
term_matrix[1:10,1:10]
## Terms
## Terms basic charac disney hair leader like main make movi nich
## basic 1 1 1 1 1 1 1 1 1 1
## charac 1 1 1 1 1 1 1 1 1 1
## disney 1 1 1 1 1 1 1 1 1 1
## hair 1 1 1 1 1 1 1 1 1 1
## leader 1 1 1 1 1 1 1 1 1 1
## like 1 1 1 1 1 1 1 1 1 1
## main 1 1 1 1 1 1 2 1 1 1
## make 1 1 1 1 1 1 1 2 1 1
## movi 1 1 1 1 1 1 1 1 1 1
## nich 1 1 1 1 1 1 1 1 1 1
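To see why the Boolean product counts shared documents, consider a toy example with three hypothetical terms and two documents:
# Toy Boolean TDM (hypothetical data): rows are terms, columns are documents
B <- matrix(c(1, 1,
              1, 0,
              0, 1), nrow = 3, byrow = TRUE,
            dimnames = list(c("a", "b", "c"), c("d1", "d2")))
B %*% t(B)  # entry (i,j) = number of documents containing both term i and term j
# "a" shares d1 with "b" and d2 with "c"; "b" and "c" never co-occur, so that entry is 0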
Part 5: Convert the Adjacency Matrix into an Edge List. (This is just a two-column listing of from-nodes and to-nodes.)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:magrittr':
##
## %>%
## The following object is masked from 'package:rvest':
##
## %>%
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
g <- graph.adjacency(term_matrix, weighted=T, mode = "undirected")
g <- simplify(g)  # remove the self-loops contributed by the diagonal of term_matrix
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
ed=get.edgelist(g, names=TRUE)
ed[1:10,]
## [,1] [,2]
## [1,] "basic" "charac"
## [2,] "basic" "disney"
## [3,] "basic" "hair"
## [4,] "basic" "leader"
## [5,] "basic" "like"
## [6,] "basic" "main"
## [7,] "basic" "make"
## [8,] "basic" "movi"
## [9,] "basic" "nich"
## [10,] "basic" "simul"
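The co-occurrence strengths survive as edge weights in g; a sketch attaching them to the edge list for inspection:
# Sketch: edge list with the co-occurrence count as a third column
ed_w = cbind(get.edgelist(g, names = TRUE), weight = E(g)$weight)
ed_w[1:5, ]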
Part 6: Using the edge list, create a spring force plot using D3. Redo the same plot, but zero out all edges which have value 1, and keep all edges with values 2 or greater. How different are the two plots? Describe the difference.
Answer: The spring force plot that keeps only edges of weight 2 or greater is more scattered than the one that keeps all edges; removing the weight-1 edges strips away most of the connections, leaving many isolated words and only the small clusters of terms that co-occur in more than one article.
library(reshape2)
library(networkD3)
library(htmlwidgets)
##
## Attaching package: 'htmlwidgets'
## The following object is masked from 'package:networkD3':
##
## JS
el <- melt(term_matrix)    # adjacency matrix -> long (term, term, weight) triples
el <- el[el$value != 0, ]  # keep only actual co-occurrences
colnames(el) <- c("V1", "from", "value")
# look-up table mapping each term to a numeric id
term_number <- as.matrix(tdm$dimnames$Terms)
term_number <- as.data.frame(term_number)
term_number$code <- seq_len(length(term_number$V1))
# attach the numeric id to each endpoint of every edge via two merges
# (the from/to labels end up swapped, which is harmless in an undirected graph)
el = merge(el, term_number, by = "V1")
colnames(el) <- c("to", "V1", "value", "to_code")
el = merge(el, term_number, by = "V1")
colnames(el) <- c("to", "from", "value", "to_code", "from_code")
# subset to the first 1000 edges so the widget stays responsive
el_sub=el[1:1000,]
el1=el_sub[,3:5]
colnames(el1)<-c("value","target","source")
#nodes = data.frame(unique(el_sub$from, el_sub$to))
nodes = data.frame(term_number[,1])
names(nodes) = "name"
nodes$group = ceiling(3*runif(length(nodes$name)))  # random group in 1..3, used only for node coloring
# create a force network for the full (subset) edge list
links1 = el1[,2:3] - 1   # networkD3 requires zero-based node indices
links1$value = el1$value
y=forceNetwork(Links = links1, Nodes = nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
Group = "group", opacity = 0.8, fontSize = 75)
#saveWidget(y, file="\\springforceplot1.html")
y
# drop all edges with weight 1, keeping only weights of 2 or more
el2=el_sub[el_sub$value!=1,]
el3=el2[,3:5]
colnames(el3)<-c("value","target","source")
#nodes = data.frame(unique(el2$from, el2$to))
nodes = data.frame(term_number[,1])
names(nodes) = "name"
nodes$group = ceiling(3*runif(length(nodes$name)))
# create a force network for the filtered edge list
links2 = el3[,2:3] - 1   # zero-based indices again
links2$value = el3$value
x=forceNetwork(Links = links2, Nodes = nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
Group = "group", opacity = 0.8, fontSize = 75)
## Warning: It looks like Source/Target is not zero-indexed. This is required
## in JavaScript and so your plot may not render.
#saveWidget(x, file="\\springforceplot2.html")
#please see the separate file for the plot
x
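If the two widgets need to be written out as standalone HTML (the commented saveWidget calls above use Windows-style paths), something like the following sketch should work; the file names are arbitrary:
# Sketch: save each force network as a self-contained HTML file in the working directory
saveWidget(y, file = "springforceplot1.html", selfcontained = TRUE)
saveWidget(x, file = "springforceplot2.html", selfcontained = TRUE)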
Part 7: Plot the degree distribution of your word network.
dd = degree.distribution(g)  # relative frequency of each degree value across all 305 nodes
dd = as.matrix(dd)
d = as.matrix(seq(0, max(degree(g))))  # the corresponding degree values, 0 through the maximum
plot(d,dd,type="l")
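Word co-occurrence networks are often heavy-tailed, which is easier to judge on log-log axes. A sketch reusing the d and dd computed above:
# Sketch: same distribution on log-log axes (zero entries dropped, since log(0) is undefined)
keep <- d > 0 & dd > 0
plot(d[keep], dd[keep], log = "xy", xlab = "degree", ylab = "relative frequency")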
Part 8: Calculate the centrality of the words in the network. What are the top 10 central words? What conclusions can you state from this?
Answer: Sorting the scores printed below, the most central words are ‘new’, ‘just’, and ‘compani’, with ‘the’, ‘one’, ‘work’, ‘look’, ‘offic’, ‘now’, and ‘also’ close behind. That ‘compani’, ‘work’, and ‘offic’ rank highly makes sense, since TechCrunch specializes in articles about the current events of tech companies. The rest of the top ten are generic, high-frequency words that appear across many stories (‘The’ even survived stopword removal because the corpus was never lowercased); words shared by several articles bridge otherwise separate clusters, which is exactly the property that eigenvector centrality rewards.
cent = evcent(g)$vector
print("Normalized Centrality Scores")
## [1] "Normalized Centrality Scores"
print(cent)
## basic charac disney
## 0.03718961 0.03718961 0.03718961
## hair leader like
## 0.03718961 0.03718961 0.03718961
## main make movi
## 0.15885735 0.21837583 0.03718961
## nich simul someth
## 0.03718961 0.03718961 0.16782825
## talent tangl unequivoc
## 0.03718961 0.03718961 0.03718961
## use way world
## 0.08224514 0.03718961 0.03718961
## aboard aifocus bring
## 0.03469597 0.03469597 0.03469597
## chip continu develop
## 0.03469597 0.03469597 0.03469597
## devot director engin
## 0.03469597 0.03469597 0.03469597
## facebook googl product
## 0.03469597 0.03469597 0.03469597
## resourc senior work
## 0.03469597 0.03469597 0.58524440
## alon compar cowork
## 0.02627166 0.02627166 0.02627166
## distract endless facefac
## 0.02627166 0.02627166 0.02627166
## gape interact offic
## 0.02627166 0.02627166 0.49568827
## open share space
## 0.02627166 0.02627166 0.02627166
## tradit add asset
## 0.02627166 0.13959367 0.13959367
## coinbas cryptocurr exchang
## 0.13959367 0.13959367 0.13959367
## explor kind list
## 0.13959367 0.40980293 0.13959367
## look new preannounc
## 0.57018532 1.00000000 0.13959367
## say startup take
## 0.15452217 0.22253204 0.13959367
## the american civil
## 0.69556304 0.04771997 0.04771997
## execut facial give
## 0.04771997 0.04771997 0.21917627
## govern guidanc heel
## 0.04771997 0.04771997 0.04771997
## liberti now plead
## 0.04771997 0.47001691 0.04771997
## recognit technolog union
## 0.04771997 0.12893211 0.04771997
## weigh among butterfli
## 0.04771997 0.30194001 0.13607565
## common compani complaint
## 0.13607565 0.94728369 0.13607565
## far focus import
## 0.13607565 0.13607565 0.13607565
## keyboard later macbook
## 0.13607565 0.13607565 0.13607565
## noiseit right shift
## 0.13607565 0.13607565 0.13607565
## sinc spec talk
## 0.13607565 0.13607565 0.13607565
## user well accus
## 0.13607565 0.13607565 0.17564608
## alleg also campaign
## 0.36051761 0.46554664 0.17564608
## clinton dnc file
## 0.17564608 0.17564608 0.30859053
## hack hacker hillari
## 0.36051761 0.17564608 0.17564608
## indict nefari note
## 0.17564608 0.17564608 0.17564608
## paid russian thing
## 0.17564608 0.36051761 0.46086068
## today undermin abil
## 0.17564608 0.17564608 0.12681412
## alas exact full
## 0.12681412 0.12681412 0.26604681
## game hap huge
## 0.12681412 0.12681412 0.12681412
## last live meltdown
## 0.12681412 0.12681412 0.12681412
## one point sell
## 0.64293359 0.12681412 0.12681412
## servic sport stream
## 0.12681412 0.12681412 0.12681412
## want when adjust
## 0.30518796 0.30518796 0.14785160
## app best brought
## 0.14785160 0.14785160 0.14785160
## camera control devic
## 0.14785160 0.14785160 0.14785160
## dslrlike featur glass
## 0.14785160 0.14785160 0.14785160
## includ iso manual
## 0.14785160 0.14785160 0.14785160
## mobil moment pro
## 0.14785160 0.14785160 0.14785160
## shutter speed accord
## 0.14785160 0.14785160 0.31673076
## chowli led math
## 0.14294162 0.14294162 0.14294162
## million near par
## 0.14294162 0.14294162 0.14294162
## pointsal rais restaur
## 0.14294162 0.14294162 0.14294162
## round sec system
## 0.14294162 0.14294162 0.14294162
## target total ventur
## 0.14294162 0.14294162 0.14294162
## alaska anchorag and
## 0.20497243 0.20497243 0.44949818
## blockbust closur countri
## 0.20497243 0.20497243 0.20497243
## daili fairbank impend
## 0.20497243 0.20497243 0.20497243
## just locat remain
## 0.95505587 0.20497243 0.20497243
## report singl store
## 0.20497243 0.20497243 0.20497243
## will with yester
## 0.20497243 0.20497243 0.20497243
## good identifi index
## 0.08538906 0.08538906 0.08538906
## invad map neglig
## 0.08538906 0.08538906 0.08538906
## privaci problem protect
## 0.08538906 0.08538906 0.08538906
## push technologist terribl
## 0.08538906 0.08538906 0.08538906
## theyr charg day
## 0.08538906 0.19655095 0.37273646
## depart intellig justic
## 0.19655095 0.19655095 0.19655095
## level meet presid
## 0.19655095 0.19655095 0.19655095
## putin set trump
## 0.19655095 0.19655095 0.19655095
## vladimir alreadi amazon
## 0.19655095 0.01993443 0.20166211
## antitrust bluster chanc
## 0.01993443 0.01993443 0.01993443
## come crosshair hous
## 0.01993443 0.01993443 0.01993443
## investig simpli slim
## 0.01993443 0.01993443 0.01993443
## threat trumpian white
## 0.01993443 0.01993443 0.01993443
## chad compet complic
## 0.09014757 0.09014757 0.09014757
## comput even fiveyear
## 0.26559337 0.09014757 0.09014757
## fou giant industri
## 0.09014757 0.12393812 0.26559337
## longstand namesak old
## 0.09014757 0.09014757 0.09014757
## quantum rigetti subject
## 0.09014757 0.09014757 0.09014757
## tackl tech announc
## 0.09014757 0.09014757 0.30498252
## apple china clean
## 0.30498252 0.30498252 0.30498252
## energi facil foster
## 0.30498252 0.30498252 0.30498252
## fund invest isnt
## 0.30498252 0.30498252 0.30498252
## supplier switch tri
## 0.30498252 0.30498252 0.30498252
## usag big get
## 0.30498252 0.18826077 0.45694288
## holiday hour juli
## 0.18826077 0.18826077 0.18826077
## kick long monday
## 0.18826077 0.18826077 0.18826077
## stop this year
## 0.18826077 0.21897256 0.36062782
## actual area can
## 0.28348533 0.28348533 0.28348533
## contractor especi find
## 0.28348533 0.28348533 0.28348533
## first for may
## 0.45286426 0.28348533 0.28348533
## oper region spin
## 0.28348533 0.28348533 0.28348533
## start step collector
## 0.28348533 0.28348533 0.18405002
## dead dont experienc
## 0.18405002 0.18405002 0.18405002
## gartner marketshar metric
## 0.18405002 0.18405002 0.18405002
## might person shipment
## 0.18405002 0.18405002 0.18405002
## worldwid african artisan
## 0.18405002 0.03780568 0.03780568
## deliveri dhl ecommerc
## 0.03780568 0.03780568 0.03780568
## global launch mallforafrica
## 0.03780568 0.03780568 0.03780568
## marketplaceafricacom merchant onlin
## 0.03780568 0.03780568 0.03780568
## retail select site
## 0.03780568 0.03780568 0.03780568
## stage week
## 0.03780568 0.03780568
sorted_cent = sort(cent,decreasing=TRUE,index.return=TRUE)
Scent = sorted_cent$x
Scent[1:3]
## new just compani
## 1.0000000 0.9550559 0.9472837
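The full top ten that the question asks for can be listed the same way:
Scent[1:10]  # the ten most central words, in decreasing order of eigenvector centrality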
Part 9: Form “communities” of words, and state any regularities you may see from these communities. You may decide on the setting for the size of these communities as you prefer.
Answer: The regularity is that within each community there is a common theme; the communities line up almost one-to-one with the scraped articles. Two examples from the membership listing below:
Community 12 appears to cover a single story about Blockbuster store closures in Alaska, with words such as ‘alaska’, ‘anchorag’, ‘blockbust’, ‘closur’, and ‘store’.
Community 9 appears to be about startup financing, with terms such as ‘million’, ‘rais’, ‘round’, ‘ventur’, ‘pointsal’, and ‘restaur’.
wtc = walktrap.community(g)  # communities via short random walks, which tend to stay inside dense clusters
res = membership(wtc)
#print(res)
sort(res,decreasing=TRUE)
## butterfli common complaint
## 15 15 15
## far focus import
## 15 15 15
## keyboard later macbook
## 15 15 15
## noiseit right shift
## 15 15 15
## sinc spec talk
## 15 15 15
## user well good
## 15 15 14
## identifi index invad
## 14 14 14
## map neglig privaci
## 14 14 14
## problem protect push
## 14 14 14
## technologist terribl theyr
## 14 14 14
## abil alas exact
## 13 13 13
## game hap huge
## 13 13 13
## last live meltdown
## 13 13 13
## point sell servic
## 13 13 13
## sport stream alaska
## 13 13 12
## anchorag blockbust closur
## 12 12 12
## countri daili fairbank
## 12 12 12
## impend locat remain
## 12 12 12
## report singl store
## 12 12 12
## will with yester
## 12 12 12
## adjust app best
## 11 11 11
## brought camera control
## 11 11 11
## devic dslrlike featur
## 11 11 11
## glass includ iso
## 11 11 11
## manual mobil moment
## 11 11 11
## pro shutter speed
## 11 11 11
## alon compar cowork
## 10 10 10
## distract endless facefac
## 10 10 10
## gape interact open
## 10 10 10
## share space tradit
## 10 10 10
## chowli led math
## 9 9 9
## million near par
## 9 9 9
## pointsal rais restaur
## 9 9 9
## round sec system
## 9 9 9
## target total ventur
## 9 9 9
## aboard aifocus bring
## 8 8 8
## chip continu develop
## 8 8 8
## devot director engin
## 8 8 8
## facebook googl product
## 8 8 8
## resourc senior startup
## 8 8 7
## chad compet complic
## 7 7 7
## comput even fiveyear
## 7 7 7
## fou industri longstand
## 7 7 7
## namesak old quantum
## 7 7 7
## rigetti subject tackl
## 7 7 7
## tech american civil
## 7 6 6
## execut facial govern
## 6 6 6
## guidanc heel liberti
## 6 6 6
## plead recognit technolog
## 6 6 6
## union weigh basic
## 6 6 5
## charac disney hair
## 5 5 5
## leader like main
## 5 5 5
## make movi nich
## 5 5 5
## simul someth talent
## 5 5 5
## tangl unequivoc use
## 5 5 5
## way world give
## 5 5 4
## giant this african
## 4 4 4
## artisan deliveri dhl
## 4 4 4
## ecommerc global launch
## 4 4 4
## mallforafrica marketplaceafricacom merchant
## 4 4 4
## onlin retail select
## 4 4 4
## site stage week
## 4 4 4
## say alreadi amazon
## 3 3 3
## antitrust bluster chanc
## 3 3 3
## come crosshair hous
## 3 3 3
## investig simpli slim
## 3 3 3
## threat trumpian white
## 3 3 3
## add asset coinbas
## 2 2 2
## cryptocurr exchang explor
## 2 2 2
## kind list look
## 2 2 2
## new preannounc take
## 2 2 2
## thing full one
## 2 2 2
## want when and
## 2 2 2
## just day big
## 2 2 2
## get holiday hour
## 2 2 2
## juli kick long
## 2 2 2
## monday stop year
## 2 2 2
## actual area can
## 2 2 2
## contractor especi find
## 2 2 2
## first for may
## 2 2 2
## oper region spin
## 2 2 2
## start step collector
## 2 2 2
## dead dont experienc
## 2 2 2
## gartner marketshar metric
## 2 2 2
## might person shipment
## 2 2 2
## worldwid work offic
## 2 1 1
## the now among
## 1 1 1
## compani accus alleg
## 1 1 1
## also campaign clinton
## 1 1 1
## dnc file hack
## 1 1 1
## hacker hillari indict
## 1 1 1
## nefari note paid
## 1 1 1
## russian today undermin
## 1 1 1
## accord charg depart
## 1 1 1
## intellig justic level
## 1 1 1
## meet presid putin
## 1 1 1
## set trump vladimir
## 1 1 1
## announc apple china
## 1 1 1
## clean energi facil
## 1 1 1
## foster fund invest
## 1 1 1
## isnt supplier switch
## 1 1 1
## tri usag
## 1 1
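Community sizes can be tabulated directly, which makes it easy to check that the communities are roughly article-sized (a sketch):
table(res)  # number of words assigned to each community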
Part 10: What is the diameter of the network? Why is this interesting?
Answer: The diameter of the network is 4. This is interesting because the diameter bounds how quickly something can spread through a network: any word here can reach any other in at most four steps, so the network is compact and diffusion across it is fast.
print(diameter(g))
## [1] 4
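The vertices realizing the diameter can be recovered with get.diameter(), which returns one longest shortest path (a sketch):
get.diameter(g)  # the words along one diameter-length path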