## [1] TRUE
## [1] TRUE
There are duplicated rows, so let’s get rid of them!
# Drop exact duplicate rows from both data sets.
fake <- distinct(fake)
real <- distinct(real)
# Check duplicated rows again
any(duplicated(fake))
## [1] FALSE
## [1] FALSE
Cool, the data now has no duplicated rows.
## [1] FALSE
## [1] FALSE
There are no missing values!
I will store it in a new variable named news
# Stack the fake and real articles, shuffle the rows reproducibly (fixed seed),
# and turn the `is_fake` label into a factor.
set.seed(1)
news <- rbind(fake, real)
news <- news[sample(nrow(news)), ] %>%
  mutate(is_fake = as.factor(is_fake))
# Preview the first rows of the shuffled data.
news %>% head()
I will paste the title to the text
Don’t worry about this chunk, this is just some data aggregations for the EDA visualization
# FAKE NEWS
# Daily totals: one row per (shifted) publication date, plus the hover label
# used by the plotly chart.
# NOTE(review): `+ years(5)` yields NA for 29 Feb (no such day five years
# later), so articles from that day are removed by the NA filter below.
fake_viz_daily_df <- read.csv("data_input/Fake.csv") %>%
  transmute(date = mdy(date) + years(5)) %>% # years shifted by 5 so the timeline looks recent
  filter(!is.na(date)) %>% # rows whose date could not be parsed are dropped
  arrange(date) %>%
  group_by(date) %>%
  summarise(daily_fake_news = n()) %>%
  mutate(tooltip_daily = glue("{date}
{daily_fake_news} fake news"))
# Monthly totals of fake articles: one row per (month, year), with a
# month-start date (`month_year`) for the x-axis and a hover label
# (`tooltip_monthly`) for the plotly chart.
fake_viz_monthly_df <- read.csv("data_input/Fake.csv") %>%
  select(date) %>%
  # Shift every date 5 years forward so the timeline looks recent. `%m+%`
  # rolls 29 Feb back to 28 Feb; a plain `+ years(5)` would turn it into NA
  # and the filter below would silently drop those articles from February.
  mutate(date = mdy(date) %m+% years(5),
         year = year(date),
         month = month(date, label = TRUE, abbr = FALSE)) %>%
  filter(complete.cases(.)) %>% # drop rows whose date failed to parse
  arrange(date) %>%
  group_by(month, year) %>%
  # `.groups = "drop"` returns an ungrouped result (no separate ungroup()).
  summarise(monthly_fake_news = n(), .groups = "drop") %>%
  arrange(year) %>%
  mutate(month_year = my(paste(month, year)),
         tooltip_monthly = glue("{month} {year}
{monthly_fake_news} fake news"))
# REAL NEWS
# Daily totals: one row per (shifted) publication date, plus the hover label
# used by the plotly chart.
# NOTE(review): `+ years(5)` yields NA for 29 Feb (no such day five years
# later), so articles from that day are removed by the NA filter below.
real_viz_daily_df <- read.csv("data_input/True.csv") %>%
  transmute(date = mdy(date) + years(5)) %>% # same 5-year shift as the fake-news data
  filter(!is.na(date)) %>% # rows whose date could not be parsed are dropped
  arrange(date) %>%
  group_by(date) %>%
  summarise(daily_real_news = n()) %>%
  mutate(tooltip_daily = glue("{date}
{daily_real_news} real news"))
# Monthly totals of real articles: one row per (month, year), with a
# month-start date (`month_year`) for the x-axis and a hover label
# (`tooltip_monthly`) for the plotly chart.
real_viz_monthly_df <- read.csv("data_input/True.csv") %>%
  select(date) %>%
  # Shift every date 5 years forward so the timeline looks recent. `%m+%`
  # rolls 29 Feb back to 28 Feb; a plain `+ years(5)` would turn it into NA
  # and the filter below would silently drop those articles from February.
  mutate(date = mdy(date) %m+% years(5),
         year = year(date),
         month = month(date, label = TRUE, abbr = FALSE)) %>%
  filter(complete.cases(.)) %>% # drop rows whose date failed to parse
  arrange(date) %>%
  group_by(month, year) %>%
  # `.groups = "drop"` returns an ungrouped result (no separate ungroup()).
  summarise(monthly_real_news = n(), .groups = "drop") %>%
  arrange(year) %>%
  mutate(month_year = my(paste(month, year)),
         tooltip_monthly = glue("{month} {year}
{monthly_real_news} real news"))

# Daily trend: one thin line per class; hover shows the prebuilt tooltip text.
plot_ly() %>%
  add_lines(
    data = fake_viz_daily_df, x = ~date, y = ~daily_fake_news,
    color = I("#FF8BA0"), name = "Fake News",
    text = ~tooltip_daily, hoverinfo = "text",
    line = list(width = 1)
  ) %>%
  add_lines(
    data = real_viz_daily_df, x = ~date, y = ~daily_real_news,
    color = I("#77CCFF"), name = "Real News",
    text = ~tooltip_daily, hoverinfo = "text",
    line = list(width = 1)
  ) %>%
  layout(
    title = "\nTotal News Daily Trend",
    xaxis = list(title = ""),
    yaxis = list(title = "Total News"),
    showlegend = TRUE,
    legend = list(title = NULL)
  )
A few months back, the total number of real news surpassed the number of fake news
# Monthly trend: one thick line per class, plotted against the month-start
# date; hover shows the prebuilt tooltip text.
plot_ly() %>%
  add_lines(
    data = fake_viz_monthly_df, x = ~month_year, y = ~monthly_fake_news,
    color = I("#FF8BA0"), name = "Fake News",
    text = ~tooltip_monthly, hoverinfo = "text",
    line = list(width = 4)
  ) %>%
  add_lines(
    data = real_viz_monthly_df, x = ~month_year, y = ~monthly_real_news,
    color = I("#77CCFF"), name = "Real News",
    text = ~tooltip_monthly, hoverinfo = "text",
    line = list(width = 4)
  ) %>%
  layout(
    title = "\nTotal News Monthly Trend",
    xaxis = list(title = ""),
    yaxis = list(title = "Total News"),
    showlegend = TRUE,
    legend = list(title = NULL)
  )
However, when we look further back, the average number of fake news is generally higher than real news
# Clean the article text. cleanText() is from the author's personal package
# (not shown here); the echoed output below suggests it lowercases, removes
# numbers, stopwords and punctuation, stems, and strips white space.
cleaned_text_corpus <- cleanText(news$text, lang = "en", as.corpus = TRUE)
## [1] "Original:"
## [1] "Next U.S. disaster aid approval likely in December: Senator Cornyn WASHINGTON (Reuters) - The next round of aid to help rebuild Puerto Rico and U.S. states after destructive hurricanes and wildfires is unlikely to be considered by Congress until it takes up a catch-all spending bill that faces a Dec. 8 deadline for passage, the No. 2 Senate Republican said on Wednesday. John Cornyn said a crush of other matters would take up time, so “we probably won’t get to that” (disaster aid) until consideration of a massive bill to fund federal agencies through next Sept. 30. The Trump administration has said it will submit to Congress a third round of disaster aid in mid-November that could total tens of billions of dollars. Congress already has provided over $51 billion Currently, the Republican-controlled Congress is mainly focused on passing a major tax reform bill by year’s end. The disaster aid would help recovery efforts in Texas, Florida, Puerto Rico and the U.S. Virgin Islands following Hurricanes Harvey, Irma and Maria in August and September. Six weeks after Maria tore through Puerto Rico, the island is struggling to pick up the pieces, and about two-thirds of its residents remain without power. The U.S. territory declared bankruptcy earlier this year and the storm has plunged its economy into deeper uncertainty. The government-wide spending bill could also become a magnet for measures that Democrats, who are in the minority in Congress, want to attach. Those could include a bipartisan plan to temporarily bolster the Affordable Care Act, commonly known as “Obamacare.” Democrats also are hinting that the end-of-year measure could be an opportunity to force passage of legislation protecting from deportation young “Dreamers,” people brought illegally to the United States as children. "
## [1] "Lowered:"
## [1] "next u.s. disaster aid approval likely in december: senator cornyn washington (reuters) - the next round of aid to help rebuild puerto rico and u.s. states after destructive hurricanes and wildfires is unlikely to be considered by congress until it takes up a catch-all spending bill that faces a dec. 8 deadline for passage, the no. 2 senate republican said on wednesday. john cornyn said a crush of other matters would take up time, so “we probably won’t get to that” (disaster aid) until consideration of a massive bill to fund federal agencies through next sept. 30. the trump administration has said it will submit to congress a third round of disaster aid in mid-november that could total tens of billions of dollars. congress already has provided over $51 billion currently, the republican-controlled congress is mainly focused on passing a major tax reform bill by year’s end. the disaster aid would help recovery efforts in texas, florida, puerto rico and the u.s. virgin islands following hurricanes harvey, irma and maria in august and september. six weeks after maria tore through puerto rico, the island is struggling to pick up the pieces, and about two-thirds of its residents remain without power. the u.s. territory declared bankruptcy earlier this year and the storm has plunged its economy into deeper uncertainty. the government-wide spending bill could also become a magnet for measures that democrats, who are in the minority in congress, want to attach. those could include a bipartisan plan to temporarily bolster the affordable care act, commonly known as “obamacare.” democrats also are hinting that the end-of-year measure could be an opportunity to force passage of legislation protecting from deportation young “dreamers,” people brought illegally to the united states as children. "
## [1] "Numbers removed:"
## [1] "next u.s. disaster aid approval likely in december: senator cornyn washington (reuters) - the next round of aid to help rebuild puerto rico and u.s. states after destructive hurricanes and wildfires is unlikely to be considered by congress until it takes up a catch-all spending bill that faces a dec. deadline for passage, the no. senate republican said on wednesday. john cornyn said a crush of other matters would take up time, so “we probably won’t get to that” (disaster aid) until consideration of a massive bill to fund federal agencies through next sept. . the trump administration has said it will submit to congress a third round of disaster aid in mid-november that could total tens of billions of dollars. congress already has provided over $ billion currently, the republican-controlled congress is mainly focused on passing a major tax reform bill by year’s end. the disaster aid would help recovery efforts in texas, florida, puerto rico and the u.s. virgin islands following hurricanes harvey, irma and maria in august and september. six weeks after maria tore through puerto rico, the island is struggling to pick up the pieces, and about two-thirds of its residents remain without power. the u.s. territory declared bankruptcy earlier this year and the storm has plunged its economy into deeper uncertainty. the government-wide spending bill could also become a magnet for measures that democrats, who are in the minority in congress, want to attach. those could include a bipartisan plan to temporarily bolster the affordable care act, commonly known as “obamacare.” democrats also are hinting that the end-of-year measure could be an opportunity to force passage of legislation protecting from deportation young “dreamers,” people brought illegally to the united states as children. "
## [1] "Stopwords removed:"
## [1] "next u.s. disaster aid approval likely december: senator cornyn washington (reuters) - next round aid help rebuild puerto rico u.s. states destructive hurricanes wildfires unlikely considered congress takes catch- spending bill faces dec. deadline passage, . senate republican said wednesday. john cornyn said crush matters take time, “ probably won’t get ” (disaster aid) consideration massive bill fund federal agencies next sept. . trump administration said will submit congress third round disaster aid mid-november total tens billions dollars. congress already provided $ billion currently, republican-controlled congress mainly focused passing major tax reform bill year’s end. disaster aid help recovery efforts texas, florida, puerto rico u.s. virgin islands following hurricanes harvey, irma maria august september. six weeks maria tore puerto rico, island struggling pick pieces, two-thirds residents remain without power. u.s. territory declared bankruptcy earlier year storm plunged economy deeper uncertainty. government-wide spending bill also become magnet measures democrats, minority congress, want attach. include bipartisan plan temporarily bolster affordable care act, commonly known “obamacare.” democrats also hinting end--year measure opportunity force passage legislation protecting deportation young “dreamers,” people brought illegally united states children. "
## [1] "Punctuation removed:"
## [1] "next us disaster aid approval likely december senator cornyn washington reuters next round aid help rebuild puerto rico us states destructive hurricanes wildfires unlikely considered congress takes catch spending bill faces dec deadline passage senate republican said wednesday john cornyn said crush matters take time “ probably won’t get ” disaster aid consideration massive bill fund federal agencies next sept trump administration said will submit congress third round disaster aid midnovember total tens billions dollars congress already provided billion currently republicancontrolled congress mainly focused passing major tax reform bill year’s end disaster aid help recovery efforts texas florida puerto rico us virgin islands following hurricanes harvey irma maria august september six weeks maria tore puerto rico island struggling pick pieces twothirds residents remain without power us territory declared bankruptcy earlier year storm plunged economy deeper uncertainty governmentwide spending bill also become magnet measures democrats minority congress want attach include bipartisan plan temporarily bolster affordable care act commonly known “obamacare” democrats also hinting endyear measure opportunity force passage legislation protecting deportation young “dreamers” people brought illegally united states children "
## [1] "Stemmed:"
## [1] "next us disast aid approv like decemb senat cornyn washington reuter next round aid help rebuild puerto rico us state destruct hurrican wildfir unlik consid congress take catch spend bill face dec deadlin passag senat republican said wednesday john cornyn said crush matter take time “ probabl won’t get ” disast aid consider massiv bill fund feder agenc next sept trump administr said will submit congress third round disast aid midnovemb total ten billion dollar congress alreadi provid billion current republicancontrol congress main focus pass major tax reform bill year’ end disast aid help recoveri effort texa florida puerto rico us virgin island follow hurrican harvey irma maria august septemb six week maria tore puerto rico island struggl pick piec twothird resid remain without power us territori declar bankruptci earlier year storm plung economi deeper uncertainti governmentwid spend bill also becom magnet measur democrat minor congress want attach includ bipartisan plan temporarili bolster afford care act common known “obamacare” democrat also hint endyear measur opportun forc passag legisl protect deport young “dreamers” peopl brought illeg unit state children"
## [1] "White spaces stripped:"
## [1] "next us disast aid approv like decemb senat cornyn washington reuter next round aid help rebuild puerto rico us state destruct hurrican wildfir unlik consid congress take catch spend bill face dec deadlin passag senat republican said wednesday john cornyn said crush matter take time “ probabl won’t get ” disast aid consider massiv bill fund feder agenc next sept trump administr said will submit congress third round disast aid midnovemb total ten billion dollar congress alreadi provid billion current republicancontrol congress main focus pass major tax reform bill year’ end disast aid help recoveri effort texa florida puerto rico us virgin island follow hurrican harvey irma maria august septemb six week maria tore puerto rico island struggl pick piec twothird resid remain without power us territori declar bankruptci earlier year storm plung economi deeper uncertainti governmentwid spend bill also becom magnet measur democrat minor congress want attach includ bipartisan plan temporarili bolster afford care act common known “obamacare” democrat also hint endyear measur opportun forc passag legisl protect deport young “dreamers” peopl brought illeg unit state children"
## <<DocumentTermMatrix (documents: 44689, terms: 172777)>>
## Non-/sparse entries: 6870522/7714360831
## Sparsity : 100%
## Maximal term length: 908
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs one peopl presid republican said say state trump will year
## 1316 25 5 4 0 10 3 8 6 5 12
## 16684 7 6 8 5 1 0 45 1 3 2
## 29777 25 5 4 0 10 3 8 6 5 12
## 30551 18 4 2 0 27 0 16 0 9 10
## 32150 7 6 8 5 1 0 45 1 3 2
## 33736 8 6 9 6 1 0 47 5 4 3
## 35664 18 4 2 0 27 0 16 0 9 10
## 41651 8 6 9 6 1 0 47 5 4 3
## 44261 14 2 4 0 4 1 10 0 1 5
## 609 14 2 4 0 4 1 10 0 1 5
## [1] 44689 172777
Too many features/terms; let’s reduce them by removing the terms that appear fewer than 150 times
## [1] 44689 5638
Cool, there are only 5638 features/terms now, less noisy!
Because of its fast computation, Naive Bayes is a very good machine learning algorithm for cases like this, where there are a lot of features/terms!
# Class predictions for the held-out test features.
pred_nb <- predict(model_nb, X_test, type = "class")
# Compare predictions against the true labels; class "1" is treated as the
# positive class for sensitivity/specificity.
confusionMatrix(data = pred_nb, reference = y_test, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4133 177
## 1 109 4518
##
## Accuracy : 0.968
## 95% CI : (0.9641, 0.9715)
## No Information Rate : 0.5253
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9359
##
## Mcnemar's Test P-Value : 7.439e-05
##
## Sensitivity : 0.9623
## Specificity : 0.9743
## Pos Pred Value : 0.9764
## Neg Pred Value : 0.9589
## Prevalence : 0.5253
## Detection Rate : 0.5055
## Detection Prevalence : 0.5177
## Balanced Accuracy : 0.9683
##
## 'Positive' Class : 1
##
The generated model is excellent, with an Accuracy of about 97% and a Sensitivity of 96%. The model has a high Accuracy, high Sensitivity, and high Specificity, indicating that the model is excellent at predicting/detecting both real and fake news.
My way of explaining the AUC of ROC score is that it reflects the level of certainty our model has in its predictions. More precisely, it is the probability that the model gives a randomly chosen positive example a higher score than a randomly chosen negative one. A high AUC of ROC score indicates that the model is very confident and sure of its predictions. As people who use the model, we want the model to be highly confident in its predictions since we rely on it for making decisions. We don’t want a model that isn’t sure or confident about its predictions. This is why the AUC of ROC score is a crucial measure to determine whether the model is prepared for practical use or not.
# Raw (per-class) prediction scores for the test set; the second column is
# passed to the ROC plot together with the true labels.
pred_nb_raw <- predict(model_nb, X_test, type = "raw")
# plotROC() is from the author's personal package (not shown here).
plotROC(pred_nb_raw[, 2], y_test)
Magnificent. Why? Because the closer the AUC is to 1, the more confident the model is at detecting which news is fake and which is real.
In conclusion, the generated model is truly outstanding, boasting an Accuracy of about 97% and a Sensitivity of 96%. The model exhibits exceptional accuracy, high sensitivity, and strong specificity, signifying its prowess in effectively identifying both genuine and fabricated news articles.
Furthermore, the impressive AUC score of 0.99 serves as an additional testament to the model’s readiness for practical deployment. The near-perfect AUC score indicates the model’s high confidence in distinguishing between real and fake news. With its remarkable performance across various metrics, the model has proven itself to be well-prepared and capable of reliable utilization.