# Load the SDG 7 articles (PDF) into a tm corpus.
# The folder is defined once so the working directory and the file listing
# cannot drift apart.
articles_dir <- "C:\\Users\\Adiza Ojei\\Documents\\2021 Hard drive Docs\\HULT ONLINE CLASS\\Data Science R\\Intermediate R\\articles"
setwd(articles_dir)

# Only list PDF files: anything else in the folder would be handed to the PDF
# reader below. File names are relative and resolve against the working
# directory set above.
nm <- list.files(path = articles_dir, pattern = "\\.pdf$", ignore.case = TRUE)
if (length(nm) == 0) {
  stop("No PDF files found in ", articles_dir, call. = FALSE)
}

# PDF reader; the "-layout" text option is passed through to the extraction
# engine (presumably layout-preserving text extraction -- confirm for the
# engine in use).
Rpdf <- readPDF(control = list(text = "-layout"))
sdg7 <- Corpus(
  URISource(nm),
  readerControl = list(reader = Rpdf)
)
# Clean the corpus step by step. Each transformation is applied to the result
# of the previous one; starting every call from the raw `sdg7` would discard
# all earlier steps and keep only the last one.
# tolower is a base function, so it is wrapped in content_transformer() to keep
# the corpus elements as tm text documents.
sdg7.corpus <- tm_map(sdg7, content_transformer(tolower))
sdg7.corpus <- tm_map(sdg7.corpus, removeNumbers)
sdg7.corpus <- tm_map(sdg7.corpus, removePunctuation)
sdg7.corpus <- tm_map(sdg7.corpus, removeWords, stopwords("english"))
# Extra domain-specific filler words to drop.
sdg7.corpus <- tm_map(sdg7.corpus, removeWords, c("per", "cent", "including"))
# Collapse whitespace last, so gaps left by the removed words are cleaned too.
sdg7.corpus <- tm_map(sdg7.corpus, stripWhitespace)
sdg7.corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
# Build the term-document matrix from the cleaned corpus.
# The control list repeats the basic normalisation at tokenisation time and
# keeps only terms whose document count falls within the global bounds
# (lower bound 3, no upper bound).
sdg7.tdm <- TermDocumentMatrix(
  sdg7.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords         = TRUE,
    tolower           = TRUE,
    stemming          = FALSE,
    removeNumbers     = TRUE,
    bounds            = list(global = c(3, Inf))
  )
)
sdg7.tdm
## <<TermDocumentMatrix (terms: 81, documents: 4)>>
## Non-/sparse entries: 275/49
## Sparsity : 15%
## Maximal term length: 14
## Weighting : term frequency (tf)
# Show the first ten terms of the matrix across all documents.
inspect(sdg7.tdm[seq_len(10), ])
## <<TermDocumentMatrix (terms: 10, documents: 4)>>
## Non-/sparse entries: 33/7
## Sparsity : 18%
## Maximal term length: 10
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 11436UN-Energy - Goal 7 Ensure access to affordable, reliable, sustainable and modern energy for all.pdf
## access 10
## accounted 1
## achieve 1
## achieving 1
## affordable 3
## africa 9
## air 0
## also 0
## areas 1
## billion 3
## Docs
## Terms Goal-7.pdf SDG_7Final1.pdf SDG7_Brief.pdf
## access 6 32 3
## accounted 0 2 1
## achieve 0 4 2
## achieving 0 1 3
## affordable 3 3 4
## africa 2 5 0
## air 2 2 3
## also 1 1 4
## areas 0 1 1
## billion 4 13 2
# Show terms 11 through 20 of the matrix across all documents.
inspect(sdg7.tdm[seq(11, 20), ])
## <<TermDocumentMatrix (terms: 10, documents: 4)>>
## Non-/sparse entries: 37/3
## Sparsity : 7%
## Maximal term length: 11
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 11436UN-Energy - Goal 7 Ensure access to affordable, reliable, sustainable and modern energy for all.pdf
## change 1
## clean 3
## climate 1
## cooking 4
## countries 2
## developed 3
## developing 4
## development 2
## economic 2
## economies 0
## Docs
## Terms Goal-7.pdf SDG_7Final1.pdf SDG7_Brief.pdf
## change 1 9 4
## clean 4 23 5
## climate 1 8 9
## cooking 1 16 1
## countries 3 25 3
## developed 0 4 1
## developing 0 10 2
## development 2 8 4
## economic 1 5 1
## economies 1 2 3
# List the terms whose frequency is at least 15 (no upper limit).
findFreqTerms(
  sdg7.tdm,
  lowfreq = 15,
  highfreq = Inf
)
## [1] "access" "africa" "billion" "change" "clean"
## [6] "climate" "cooking" "countries" "developing" "development"
## [11] "efficiency" "electricity" "energy" "fuels" "global"
## [16] "people" "progress" "renewable" "sustainable" "technologies"
## [21] "will"
# Keep the frequent terms (frequency >= 15) for the later steps.
sdg7.frq <- findFreqTerms(
  sdg7.tdm,
  lowfreq = 15,
  highfreq = Inf
)

# Terms associated with "energy", using 0.75 as the lower correlation limit.
findAssocs(
  sdg7.tdm,
  terms = "energy",
  corlimit = 0.75
)
## $energy
## improve renewable development efforts new
## 1.00 1.00 0.99 0.99 0.99
## poverty sector change clean global
## 0.99 0.99 0.98 0.98 0.98
## investment sustainable systems technologies countries
## 0.98 0.98 0.98 0.98 0.97
## ensure health sectors will achieve
## 0.97 0.97 0.97 0.97 0.96
## progress year cooking developing economic
## 0.96 0.96 0.95 0.94 0.94
## people transformation without transport access
## 0.94 0.94 0.94 0.93 0.92
## billion education emissions major power
## 0.92 0.91 0.91 0.91 0.91
## public water accounted eradication improved
## 0.91 0.91 0.90 0.90 0.90
## particularly system human lives many
## 0.90 0.87 0.81 0.81 0.75
# Dense matrix (terms x documents) restricted to the frequent terms.
dm.sdg7 <- as.matrix(
  sdg7.tdm[sdg7.frq, ]
)
dm.sdg7
## Docs
## Terms 11436UN-Energy - Goal 7 Ensure access to affordable, reliable, sustainable and modern energy for all.pdf
## access 10
## africa 9
## billion 3
## change 1
## clean 3
## climate 1
## cooking 4
## countries 2
## developing 4
## development 2
## efficiency 1
## electricity 6
## energy 23
## fuels 4
## global 6
## people 5
## progress 3
## renewable 4
## sustainable 4
## technologies 3
## will 2
## Docs
## Terms Goal-7.pdf SDG_7Final1.pdf SDG7_Brief.pdf
## access 6 32 3
## africa 2 5 0
## billion 4 13 2
## change 1 9 4
## clean 4 23 5
## climate 1 8 9
## cooking 1 16 1
## countries 3 25 3
## developing 0 10 2
## development 2 8 4
## efficiency 0 7 10
## electricity 11 18 1
## energy 16 83 33
## fuels 3 4 4
## global 1 19 8
## people 5 12 4
## progress 0 11 1
## renewable 2 19 6
## sustainable 6 20 9
## technologies 1 8 3
## will 1 14 7
# Total count of each frequent term over all documents, largest first.
sort_dm.sdg7 <- sort(
  rowSums(dm.sdg7),
  decreasing = TRUE
)

# Two-column frequency table (word, freq) used by the plots that follow.
df_sdg7 <- data.frame(
  word = names(sort_dm.sdg7),
  freq = sort_dm.sdg7
)
# Word cloud of the frequent terms; the seed fixes the random placement.
set.seed(1234)
wordcloud(
  words        = df_sdg7$word,
  freq         = df_sdg7$freq,
  min.freq     = 1,
  max.words    = 75,
  random.order = FALSE,
  rot.per      = 0.35,
  colors       = brewer.pal(n = 8, name = "Dark2")
)
# Bar chart: one bar per word, bar height = total frequency, coloured by word.
sdg7_barchart <- ggplot(
  data = df_sdg7,
  mapping = aes(x = word, y = freq, fill = word)
) +
  geom_col(position = position_dodge())
sdg7_barchart
# Network-style plot built directly from the frequency table with the "fr"
# layout.
# NOTE(review): df_sdg7 holds (word, freq) pairs rather than an explicit edge
# list; presumably its two columns are interpreted as edge endpoints --
# confirm this is the intended graph.
ggraph(df_sdg7, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    vjust = 1,
    hjust = 1
  )
# Same bars as the bar chart, drawn in polar coordinates with the frequency
# mapped to the angle.
chart_sdg7 <- ggplot(
  data = df_sdg7,
  mapping = aes(x = word, y = freq, fill = word)
) +
  geom_col() +
  coord_polar(theta = "y", start = 0)
chart_sdg7
# Point plot of total frequency per word, one colour per word.
ggplot(
  data = df_sdg7,
  mapping = aes(x = word, y = freq, colour = word)
) +
  geom_point()