mining.R

library("tm")

## Loading required package: NLP

library("ggplot2")

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library("SnowballC")
library("RColorBrewer")
library("syuzhet")
library("wordcloud")
# Status message: reaching this line means every library() call above succeeded.
print("Done Importing ibrary")

## [1] "Done Importing ibrary"

# Read the tweet data set from the Excel workbook in the working directory.
# NOTE(review): the first spreadsheet column has no header, so readxl renames
# it to `...1` and reports that via a "New names" message.
library("readxl")
text_mining <- read_excel(path = "text mining.xlsx")

## New names:
## • `` -> `...1`

# Wrap the tweet text in a tm corpus: one document per element of the column.
TextDoc <- Corpus(VectorSource(text_mining[["tweet"]]))

# Number of columns in the data set (length() of a data frame counts columns).
# NOTE(review): if the number of tweets was intended, nrow() is the call to use.
ncol(text_mining)

## [1] 5

# Column names as imported.
names(text_mining)

## [1] "...1"   "date"   "target" "insult" "tweet"

# Preview the first six rows.
head(text_mining, n = 6)

## # A tibble: 6 × 5
##   ...1  date  target                   insult                              tweet
##   <chr> <chr> <chr>                    <chr>                               <chr>
## 1 1     41921 thomas-frieden           fool                                "Can…
## 2 2     41921 thomas-frieden           DOPE                                "Can…
## 3 3     42171 politicians              all talk and no action              "Big…
## 4 4     42179 ben-cardin               It's politicians like Cardin that … "Pol…
## 5 5     42179 neil-young               total hypocrite                     "For…
## 6 6     42179 rockin-in-the-free-world didn't love it                      ".@N…

# Term-document matrix: rows are terms, columns are documents (tweets).
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
# Dense copy of the matrix so that base rowSums() can be applied to it.
dtm_m <- as.matrix(TextDoc_dtm)

# Total count per term across all documents, sorted by decreasing frequency.
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
# Two-column frequency table; the term names also become the row names.
dtm_d <- data.frame(
  word = names(dtm_v),
  freq = dtm_v
)

# Show the five most frequent terms.
head(dtm_d, n = 5)

##      word  freq
## the   the 18004
## and   and  8185
## for   for  3578
## that that  3294
## they they  3287

# Bar chart of the five most frequent words, read from the first five rows of
# the sorted frequency table `dtm_d`.
barplot(
  height    = dtm_d$freq[1:5],
  names.arg = dtm_d$word[1:5],
  las       = 2,  # axis labels perpendicular to the axis
  col       = "pink",
  main      = "Top 5 most frequent words",
  ylab      = "Word frequencies"
)

# Pie chart of the five most frequent words.
#
# Slice sizes and labels are taken from `dtm_d`, the frequency table that is
# already sorted by decreasing frequency, so this chart always agrees with the
# table and the bar chart. The counts and labels used to be typed in by hand
# and did not match the frequencies computed from the data.
top_words <- head(dtm_d, 5)

# `x` and `labels` are kept as globals in case later code reads them.
x <- top_words$freq
labels <- as.character(top_words$word)

# One colour per slice, shared by the pie and its legend so they always match.
slice_colors <- rainbow(length(x))

# Plot the chart.
pie(x, labels, main = "Pie Chart of Top 5 Most word appeared",
    col = slice_colors)
legend("topright", legend = labels, cex = 0.8, fill = slice_colors)

mining.R

Acer

2024-11-01