Introduction

This markdown document shows:
- Duplicate tweet identification
- Upload of non-duplicates to MongoDB database (local instance)

The tweet id (the “id” variable) was set as the primary key in the “tweets” table in the “testdb” MongoDB database.

The “cleanTweets” data frame was produced by the process demonstrated in http://www.rpubs.com/johannraath/twitter_scrape_sentiment
For the sake of this markdown document, it was imported from a CSV file.

Process

Connect to MongoDB

Connect to MongoDB “testdb” and table “tweets”.

library(mongolite)
# Handle on the "tweets" collection of the "testdb" database.
tweetTable <- mongo(db = "testdb", collection = "tweets")
# Number of records before the upload; the empty query "{}" applies no filter.
tweetTable$count("{}")
## [1] 7033

Find duplicates

  1. Obtain list of tweet ids from the data frame
  2. Obtain list of tweet ids from the MongoDB database
  3. Compile list of duplicates
  4. Create new data frame that does not contain the duplicate tweets
# Tweet ids are 18-digit integers, which is beyond 2^53, the largest range in
# which a double can represent every integer exactly. If read.csv() is left to
# guess the column type it parses the ids as numeric and rounds them, so the
# ids no longer identify tweets reliably. Read the id column as text instead.
cleanTweet <- read.csv(
  "C:/R/Datasets/cleanTweets.csv",
  colClasses = c(id = "character")
)
# Plain character vector of the ids held in the data frame; it is compared
# against the ids already stored in the database further down.
tweetList <- as.character(cleanTweet$id)
head(tweetList)
## [1] "957931715250544640" "957931714994679808" "957931714826919936"
## [4] "957931714646482944" "957931714621386752" "957931713375735808"
# Fetch every record from the collection (empty query), requesting only the
# "id" field through the projection.
DBtweetIDs <- tweetTable$find(fields = '{"id": 1}', query = '{}')
head(DBtweetIDs, n = 6L)
##                        _id                 id
## 1 5a5c97e64a23710758000a22 952847407322140672
## 2 5a5c97e64a23710758000a23 952845976632512512
## 3 5a5c97e64a23710758000a24 952840007852744710
## 4 5a5c97e64a23710758000a25 952831356349894656
## 5 5a5c97e64a23710758000a26 952827351959834624
## 6 5a5c97e64a23710758000a27 952824790926155776
# Ids stored in the database that also occur in the freshly loaded data frame.
storedAndLoaded <- DBtweetIDs$id %in% tweetList
duplicates <- DBtweetIDs$id[storedAndLoaded]
# Row positions in cleanTweet whose id matches one of those duplicates.
duplicateRows <- which(!is.na(match(cleanTweet$id, duplicates)))
length(duplicateRows)
## [1] 300
# Build the data frame of tweets that still have to be uploaded. The case of
# no duplicates is handled on its own because negative indexing with a
# zero-length vector would select no rows at all.
cleanTweet2 <- if (length(duplicateRows) > 0) {
  cleanTweet[-duplicateRows, ]
} else {
  cleanTweet
}

# VERY IMPORTANT! The key has to be in the right format: otherwise the
# database insert identifies duplicates incorrectly and the insert fails.
cleanTweet2$id <- as.character(cleanTweet2$id)

head(cleanTweet2)
##   X
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
##                                                                                                                                         text
## 1 RT @HispanicsTrump: Just found out the Grammys are on tonight...-I'd rather eat a Tide Pod than watch that left wing circus event-#Grammys
## 2                   RT @MarkDice: This Cardi B character makes music for the CIA to use in their torture of terror suspects, right? #GRAMMYs
## 3                                                          RT @TheShadyFacts: LOONA members preparing to walk onto 2018 #GRAMMYsRed Carpet! 
## 4                            RT @LookDifferent: .@Camila_Cabello is going off on #dreamers at the #grammys and i'm cryingggggg. #dreamactnow
## 5                                                                        RT @RihannaDaily: New photos of Rihanna backstage at the #GRAMMYs. 
## 6 RT @DrMartyFox: The #Grammys Honors Female Victims Of #MeToo Sexual Abuse -By Having RAPE ENABLER Hillary Who Terrorized Female Victims Of
##                contentURL favorited favoriteCount             created
## 1                    <NA>     FALSE             0 2018-01-29 11:01:33
## 2                    <NA>     FALSE             0 2018-01-29 11:01:33
## 3 https://t.co/GxtkLf64Gf     FALSE             0 2018-01-29 11:01:32
## 4                    <NA>     FALSE             0 2018-01-29 11:01:32
## 5 https://t.co/kWiF58LaMr     FALSE             0 2018-01-29 11:01:32
## 6                    <NA>     FALSE             0 2018-01-29 11:01:32
##   truncated                 id    screenName retweetCount isRetweet
## 1     FALSE 957931715250544640   MaryHar5117          106      TRUE
## 2     FALSE 957931714994679808       ricuman          179      TRUE
## 3     FALSE 957931714826919936     homomuses         1415      TRUE
## 4     FALSE 957931714646482944 _camilizer97_          158      TRUE
## 5     FALSE 957931714621386752      janvdyne         3173      TRUE
## 6     FALSE 957931713375735808   SandyGail22           20      TRUE
##   retweeted sentimentScore
## 1     FALSE              0
## 2     FALSE             -2
## 3     FALSE              0
## 4     FALSE              0
## 5     FALSE              0
## 6     FALSE             -4

Upload tweets to MongoDB database

# Only call the insert when at least one non-duplicate tweet is left.
# stop_on_error = FALSE is passed through to mongolite's insert unchanged.
if (nrow(cleanTweet2) > 0) {
  tweetTable$insert(cleanTweet2, stop_on_error = FALSE)
}
## List of 5
##  $ nInserted  : num 700
##  $ nMatched   : num 0
##  $ nRemoved   : num 0
##  $ nUpserted  : num 0
##  $ writeErrors: list()

Check number of database records

# Record count after the upload; the empty query "{}" applies no filter.
tweetTable$count("{}")
## [1] 7733