Fake news detection using R

library(tidyverse)
library(tidytext)
library(syuzhet)
library(stringr)
library(stringi)
#importing the dataset
news <- read_csv("fake_news_dataset.csv") 
glimpse(news)
## Rows: 12,999
## Columns: 20
## $ uuid               <chr> "6a175f46bcd24d39b3e962ad0f29936721db70db", "2bdc29…
## $ ord_in_thread      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author             <chr> "Barracuda Brigade", "reasoning with facts", "Barra…
## $ published          <dttm> 2016-10-26 18:41:00, 2016-10-29 05:47:11, 2016-10-…
## $ title              <chr> "Muslims BUSTED: They Stole Millions In Gov’t Benef…
## $ text               <chr> "Print They should pay all the back all the money p…
## $ language           <chr> "english", "english", "english", "english", "englis…
## $ crawled            <dttm> 2016-10-26 22:49:27, 2016-10-29 05:47:11, 2016-10-…
## $ site_url           <chr> "100percentfedup.com", "100percentfedup.com", "100p…
## $ country            <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US…
## $ domain_rank        <dbl> 25689, 25689, 25689, 25689, 25689, 25689, 25689, 25…
## $ thread_title       <chr> "Muslims BUSTED: They Stole Millions In Gov’t Benef…
## $ spam_score         <dbl> 0.000, 0.000, 0.000, 0.068, 0.865, 0.000, 0.701, 0.…
## $ main_img_url       <chr> "http://bb4sp.com/wp-content/uploads/2016/10/Fullsc…
## $ replies_count      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ participants_count <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, …
## $ likes              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ comments           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shares             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ type               <chr> "bias", "bias", "bias", "bias", "bias", "bias", "bi…
head(news$text, 3)
## [1] "Print They should pay all the back all the money plus interest. The entire family and everyone who came in with them need to be deported asap. Why did it take two years to bust them? \nHere we go again …another group stealing from the government and taxpayers! A group of Somalis stole over four million in government benefits over just 10 months! \nWe’ve reported on numerous cases like this one where the Muslim refugees/immigrants commit fraud by scamming our system…It’s way out of control! More Related"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [2] "Why Did Attorney General Loretta Lynch Plead The Fifth? Barracuda Brigade 2016-10-28 Print The administration is blocking congressional probe into cash payments to Iran. Of course she needs to plead the 5th. She either can’t recall, refuses to answer, or just plain deflects the question. Straight up corruption at its finest! \n100percentfedUp.com ; Talk about covering your ass! Loretta Lynch did just that when she plead the Fifth to avoid incriminating herself over payments to Iran…Corrupt to the core! Attorney General Loretta Lynch is declining to comply with an investigation by leading members of Congress about the Obama administration’s secret efforts to send Iran $1.7 billion in cash earlier this year, prompting accusations that Lynch has “pleaded the Fifth” Amendment to avoid incriminating herself over these payments, according to lawmakers and communications exclusively obtained by the Washington Free Beacon. \nSen. Marco Rubio (R., Fla.) and Rep. Mike Pompeo (R., Kan.) initially presented Lynch in October with a series of questions about how the cash payment to Iran was approved and delivered. \nIn an Oct. 24 response, Assistant Attorney General Peter Kadzik responded on Lynch’s behalf, refusing to answer the questions and informing the lawmakers that they are barred from publicly disclosing any details about the cash payment, which was bound up in a ransom deal aimed at freeing several American hostages from Iran. \nThe response from the attorney general’s office is “unacceptable” and provides evidence that Lynch has chosen to “essentially plead the fifth and refuse to respond to inquiries regarding [her]role in providing cash to the world’s foremost state sponsor of terrorism,” Rubio and Pompeo wrote on Friday in a follow-up letter to Lynch. More Related"
## [3] "Red State : \nFox News Sunday reported this morning that Anthony Weiner is cooperating with the FBI, which has re-opened (yes, lefties: “re-opened”) the investigation into Hillary Clinton’s classified emails. Watch as Chris Wallace reports the breaking news during the panel segment near the end of the show: \nAnd the news is breaking while we’re on the air. Our colleague Bret Baier has just sent us an e-mail saying he has two sources who say that Anthony Weiner, who also had co-ownership of that laptop with his estranged wife Huma Abedin, is cooperating with the FBI investigation, had given them the laptop, so therefore they didn’t need a warrant to get in to see the contents of said laptop. Pretty interesting development. \nTargets of federal investigations will often cooperate, hoping that they will get consideration from a judge at sentencing. Given Weiner’s well-known penchant for lying, it’s hard to believe that a prosecutor would give Weiner a deal based on an agreement to testify, unless his testimony were very strongly corroborated by hard evidence. But cooperation can take many forms — and, as Wallace indicated on this morning’s show, one of those forms could be signing a consent form to allow   the contents of devices that they could probably get a warrant for anyway. We’ll see if Weiner’s cooperation extends beyond that. More Related"

Some data cleaning

The dataset needs to be “cleaned” and “wrangled”

#bs and conspiracy news are also fake
news <- news %>% 
  mutate(type = case_match(type, c("bs", "conspiracy")~ "fake", 
          c("bias", "satire", "hate", "junksci", "state") ~ "real" ))%>% 
  filter(!is.na(type))
#count fake and real news 
news %>% group_by(type)%>% summarise(count = n())
## # A tibble: 2 × 2
##   type  count
##   <chr> <int>
## 1 fake  11922
## 2 real   1058

Now let’s apply function for finding question marks and exclamations and adding into our dataframe.

news <- news %>% 
  mutate(exc = str_count(text, "!")) %>% 
  mutate(que = str_count(text, "\\?") )%>% 
  mutate( word_count = stri_count_words(text)) 
  

news %>% group_by(type) %>% 
  summarize (exc_mark_prop = mean(exc/word_count*1000, na.rm = TRUE), que_mark_prop = mean(que/word_count*1000, na.rm = TRUE))
## # A tibble: 2 × 3
##   type  exc_mark_prop que_mark_prop
##   <chr>         <dbl>         <dbl>
## 1 fake           3.20          3.17
## 2 real           5.32          2.92
library(shiny)
## Warning: package 'shiny' was built under R version 4.3.3
# install.packages("wordcloud2")
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.3.3
# install.packages("tm")
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.3.3
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# install.packages("colourpicker")
library(colourpicker)
## Warning: package 'colourpicker' was built under R version 4.3.3
## 
## Attaching package: 'colourpicker'
## The following object is masked from 'package:shiny':
## 
##     runExample
ui <- fluidPage(
  h1("Word Cloud"),
  h4(tags$a(href = "https://www.antoinesoetewey.com/", "Antoine Soetewey")),
  # Create a container for tab panels
  tabsetPanel(
    # Create a "Word cloud" tab
    tabPanel(
      title = "Word cloud",
      sidebarLayout(
        sidebarPanel(
          radioButtons(
            inputId = "source",
            label = "Word source",
            choices = c(
              "I Have a Dream speech by Martin Luther King Jr's" = "book",
              "Use your own words" = "own",
              "Upload a file" = "file"
            )
          ),
          hr(),
          # Add the selector for the language of the text
          selectInput(
            inputId = "language",
            label = "Remove stopwords in",
            choices = c("Danish", "Dutch", "English", "Finnish", "French", "German", "Hungarian", "Italian", "Norwegian", "Portuguese", "Russian", "Spanish", "Swedish"),
            multiple = FALSE,
            selected = "English"
          ),
          conditionalPanel(
            condition = "input.source == 'own'",
            textAreaInput("text", "Enter text", rows = 7)
          ),
          # Wrap the file input in a conditional panel
          conditionalPanel(
            # The condition should be that the user selects
            # "file" from the radio buttons
            condition = "input.source == 'file'",
            fileInput("file", "Select a file")
          ),
          hr(),
          checkboxInput("remove_words", "Remove specific words?", FALSE),
          conditionalPanel(
            condition = "input.remove_words == 1",
            textAreaInput("words_to_remove1", "Words to remove (one per line)", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove1.length > 0",
            textAreaInput("words_to_remove2", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove2.length > 0",
            textAreaInput("words_to_remove3", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove3.length > 0",
            textAreaInput("words_to_remove4", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove4.length > 0",
            textAreaInput("words_to_remove5", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove5.length > 0",
            textAreaInput("words_to_remove6", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove6.length > 0",
            textAreaInput("words_to_remove7", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove7.length > 0",
            textAreaInput("words_to_remove8", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove8.length > 0",
            textAreaInput("words_to_remove9", "", rows = 1)
          ),
          conditionalPanel(
            condition = "input.remove_words == 1 && input.words_to_remove9.length > 0",
            textAreaInput("words_to_remove10", "", rows = 1)
          ),
          hr(),
          numericInput("num", "Maximum number of words",
                       value = 100, min = 5
          ),
          hr(),
          colourInput("col", "Background color", value = "white"),
          hr(),
          HTML('<p>Report a <a href="https://github.com/AntoineSoetewey/word-cloud/issues">bug</a> or view the <a href="https://github.com/AntoineSoetewey/word-cloud/blob/master/app.R">code</a>. Back to <a href="https://www.antoinesoetewey.com/">www.antoinesoetewey.com</a>.</p>')
        ),
        mainPanel(
          wordcloud2Output("cloud"),
          # br(),
          # br(),
          # tags$a(href="https://www.antoinesoetewey.com/", "Back to www.antoinesoetewey.com"),
          br(),
          br()
        )
      )
    ),
    # Create an "About this app" tab
    tabPanel(
      title = "About this app",
      br(),
      "Instructions on how to use this Shiny app:",
      br(),
      br(),
      HTML("<ul><li>When uploading a file, make sure to upload a .csv or .txt file</li>
       <li>If it is a .csv file, there should be only one column containing all words or sentences (see below for example files)</li>
       <li>Numbers and punctuations will be automatically removed, as well as stop words in the language of your choice (via the dropdown selector)</li></ul>"),
      "Example files:",
      tags$a(href = "https://www.antoinesoetewey.com/files/ihaveadream.csv", "example.csv"),
      "and",
      tags$a(href = "https://www.antoinesoetewey.com/files/ihaveadream.txt", "example.txt"),
      br(),
      br(),
      em("Source: DataCamp"),
      br(),
      br(),
      HTML('<p>Report a <a href="https://github.com/AntoineSoetewey/word-cloud/issues">bug</a> or view the <a href="https://github.com/AntoineSoetewey/word-cloud/blob/master/app.R">code</a>. Back to <a href="https://www.antoinesoetewey.com/">www.antoinesoetewey.com</a>.</p>'),
      br(),
      br()
    )
  )
)

server <- function(input, output) {
  data_source <- reactive({
    if (input$source == "book") {
      data <- read.csv("ihaveadream.csv",
                       sep = "&",
                       stringsAsFactors = FALSE
      )
      data <- data[, 1]
    } else if (input$source == "own") {
      data <- input$text
    } else if (input$source == "file") {
      data <- input_file()
    }
    return(data)
  })
  
  input_file <- reactive({
    if (is.null(input$file)) {
      return("")
    }
    readLines(input$file$datapath)
  })
  
  create_wordcloud <- function(data, num_words = 100, background = "white") {
    
    # If text is provided, convert it to a dataframe of word frequencies
    if (is.character(data)) {
      corpus <- Corpus(VectorSource(data))
      corpus <- tm_map(corpus, tolower)
      corpus <- tm_map(corpus, removePunctuation)
      corpus <- tm_map(corpus, removeNumbers)
      corpus <- tm_map(corpus, removeWords, stopwords(tolower(input$language)))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove1))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove2))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove3))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove4))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove5))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove6))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove7))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove8))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove9))
      corpus <- tm_map(corpus, removeWords, c(input$words_to_remove10))
      tdm <- as.matrix(TermDocumentMatrix(corpus))
      data <- sort(rowSums(tdm), decreasing = TRUE)
      data <- data.frame(word = names(data), freq = as.numeric(data))
    }
    
    # Make sure a proper num_words is provided
    if (!is.numeric(num_words) || num_words < 3) {
      num_words <- 3
    }
    
    # Grab the top n most common words
    data <- head(data, n = num_words)
    if (nrow(data) == 0) {
      return(NULL)
    }
    
    wordcloud2(data, backgroundColor = background)
  }
  output$cloud <- renderWordcloud2({
    create_wordcloud(data_source(),
                     num_words = input$num,
                     background = input$col
    )
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents