Brief Summary

This milestone report is the Week 2 project for the Data Science Capstone course, part of the Data Science Specialization offered by Johns Hopkins University.

This report summarizes basic exploratory analyses using the prepared and cleaned data from Week 1 of the Data Science Capstone Project Course. In addition, I use the findings from these analyses to create preliminary term-frequency (single-word) plots.

Data Preparation/Cleaning

Loading Libraries

#install.packages("knitr")
library(knitr)
library(NLP)
library(tm)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("ngram")
library(ngram)
#install.packages("tidyr")
library(tidyr)
#install.packages("tokenizers")
library(tokenizers)
#install.packages("SnowballC")
library(SnowballC)
#install.packages("tidytext")
library(tidytext)

Loading the text data

Read each dataset and assign it to its own variable

### Formulating paths
# Directory that holds the three source files. Files are addressed by full
# path below, so the working directory is never changed.
data_dir <- "C:/Users/mds6592/Desktop//final/en_US"
BLGpath <- "en_US.blogs.txt"
NWSpath <- "en_US.news.txt"
TWIpath <- "en_US.twitter.txt"

# Read one text file and return its lines as an ASCII character vector.
#
# path: path of the file to read.
# Returns: character vector with one element per line. Lines are read as
#   UTF-8 from a binary-mode connection with embedded NULs skipped, then
#   converted to ASCII; bytes that cannot be converted are dropped (sub = "").
read_ascii_lines <- function(path) {
  conn <- file(path, open = "rb")
  # Close the connection even when readLines() fails part-way through.
  on.exit(close(conn), add = TRUE)
  lines <- readLines(conn, encoding = "UTF-8", skipNul = TRUE)
  # Naming the source encoding keeps the conversion independent of the
  # locale the script happens to run under.
  iconv(lines, from = "UTF-8", to = "ASCII", sub = "")
}

### Reading blog files
blogs <- read_ascii_lines(file.path(data_dir, BLGpath))

### Reading news files
news <- read_ascii_lines(file.path(data_dir, NWSpath))

### Reading twitter files
twitter <- read_ascii_lines(file.path(data_dir, TWIpath))

Exploratory Analysis

Finding Number of Lines in each dataset

# Number of lines (vector elements) in each dataset; the recorded console
# output follows each call.
print(length(blogs))
## [1] 899288
print(length(news))
## [1] 1010242
print(length(twitter))
## [1] 2360148

Word Counts of each dataset

# Total number of words in each dataset.
# boundary("word") makes str_count() count the words in each line. The
# earlier pattern "\\W+" counted runs of non-word characters (the separators
# between words) instead, so e.g. a line holding a single word counted as 0.
# NOTE: the recorded outputs below were produced with the earlier "\\W+"
# pattern; re-knit the report to refresh them.
BlogsWC <- sum(str_count(blogs, boundary("word")))
BlogsWC
## [1] 37799774
NewsWC <- sum(str_count(news, boundary("word")))
NewsWC
## [1] 35582408
TwitterWC <- sum(str_count(twitter, boundary("word")))
TwitterWC
## [1] 30406254

Character counts of each dataset

# Total number of characters in each dataset: str_count() with its default
# pattern yields the per-line character count, which is then summed.
BlogsCharCount <- blogs %>% str_count() %>% sum()
print(BlogsCharCount)
## [1] 206043906
NewsCharCount <- news %>% str_count() %>% sum()
print(NewsCharCount)
## [1] 202917604
TwitterCharCount <- twitter %>% str_count() %>% sum()
print(TwitterCharCount)
## [1] 161961555

Building Corpus

Sampling 1% of each dataset and combining the samples

# Draw a random 1% of the lines from each source and pool them.
# The seed is fixed so that the sample, and every figure derived from it,
# is the same on each run.
set.seed(1234)

# %/% gives a whole-number sample size (the plain division used before
# produced a fractional size that sample() had to truncate).
Blogs_Sample <- sample(blogs, size = length(blogs) %/% 100)
News_Sample <- sample(news, size = length(news) %/% 100)
Twitter_Sample <- sample(twitter, size = length(twitter) %/% 100)

Total_Sample <- c(Blogs_Sample, News_Sample, Twitter_Sample)

Corpus formation

# Build a cleaned tm corpus from raw text.
#
# x: character vector with one document per element (defaults to the global
#    Total_Sample).
# Returns: a VCorpus whose documents have been lower-cased, had English stop
#   words, punctuation and digits removed, been stemmed, and had runs of
#   whitespace collapsed.
build_corpus <- function(x = Total_Sample) {
  texts <- VCorpus(VectorSource(x))
  # content_transformer() keeps every document a text document; a bare
  # tolower would replace each one with a plain character vector.
  texts <- tm_map(texts, content_transformer(tolower))
  # Stop words go first: the stop-word list contains contractions written
  # with apostrophes ("don't", "that's"), which stop matching once
  # punctuation has been stripped (leaving "dont", "that", ...).
  texts <- tm_map(texts, removeWords, stopwords("english"))
  texts <- tm_map(texts, removePunctuation)
  texts <- tm_map(texts, removeNumbers)
  texts <- tm_map(texts, stemDocument)
  # Last, so the gaps left by the removals above are collapsed as well.
  texts <- tm_map(texts, stripWhitespace)
  texts
}

CorpusData <- build_corpus(Total_Sample)

Forming Document Term Matrix

# Document-term matrix built from the cleaned corpus.
dtm <- tm::DocumentTermMatrix(x = CorpusData)

Getting the frequencies

# Total occurrences of every term across all documents, named by term.
# The document-term matrix is sparse: dtm$v holds its non-zero counts and
# dtm$j the column (term) index of each count. Summing v by j gives the
# column totals without first expanding the matrix to a dense
# documents x terms array, which as.matrix(dtm) would do.
freq <- local({
  totals <- rowsum(dtm$v, group = dtm$j)
  out <- setNames(numeric(ncol(dtm)), Terms(dtm))
  # Row names of `totals` are the column indices that actually occur.
  out[as.integer(rownames(totals))] <- totals[, 1]
  out
})

freq_d <- sort(freq, decreasing = TRUE)

# head() returns fewer entries instead of NA padding when there are
# fewer than ten terms.
head(freq_d, 10)
## will  one  get just said like time  can  day year 
## 3310 3070 3037 3025 2994 2993 2629 2495 2284 2199
# Terms that occur at least 100 times in total.
findFreqTerms(dtm, lowfreq = 100)
##    [1] "abl"          "absolut"      "accept"       "access"      
##    [5] "accord"       "account"      "across"       "act"         
##    [9] "action"       "activ"        "actual"       "add"         
##   [13] "addit"        "address"      "administr"    "adult"       
##   [17] "afternoon"    "age"          "agenc"        "agent"       
##   [21] "ago"          "agre"         "ahead"        "air"         
##   [25] "album"        "allow"        "almost"       "alon"        
##   [29] "along"        "alreadi"      "also"         "although"    
##   [33] "alway"        "amaz"         "america"      "american"    
##   [37] "among"        "amount"       "angel"        "anim"        
##   [41] "announc"      "annual"       "anoth"        "answer"      
##   [45] "anyon"        "anyth"        "anyway"       "apart"       
##   [49] "app"          "appar"        "appear"       "appl"        
##   [53] "appli"        "appreci"      "approv"       "april"       
##   [57] "area"         "arent"        "arm"          "around"      
##   [61] "arrest"       "arriv"        "art"          "articl"      
##   [65] "artist"       "ask"          "ass"          "assist"      
##   [69] "associ"       "attack"       "attempt"      "attend"      
##   [73] "attent"       "attorney"     "author"       "avail"       
##   [77] "averag"       "award"        "away"         "awesom"      
##   [81] "babi"         "back"         "bad"          "bag"         
##   [85] "ball"         "band"         "bank"         "bar"         
##   [89] "base"         "basic"        "battl"        "beach"       
##   [93] "beat"         "beauti"       "becam"        "becom"       
##   [97] "bed"          "beer"         "began"        "begin"       
##  [101] "behind"       "believ"       "benefit"      "best"        
##  [105] "better"       "big"          "biggest"      "bill"        
##  [109] "billion"      "birthday"     "bit"          "bitch"       
##  [113] "black"        "bless"        "block"        "blog"        
##  [117] "blue"         "board"        "bodi"         "book"        
##  [121] "bore"         "born"         "bought"       "bowl"        
##  [125] "box"          "boy"          "brain"        "brand"       
##  [129] "break"        "bring"        "brother"      "brought"     
##  [133] "brown"        "budget"       "build"        "busi"        
##  [137] "buy"          "cake"         "california"   "call"        
##  [141] "came"         "campaign"     "can"          "cant"        
##  [145] "car"          "card"         "care"         "career"      
##  [149] "carri"        "case"         "catch"        "caus"        
##  [153] "celebr"       "center"       "central"      "certain"     
##  [157] "challeng"     "chanc"        "chang"        "charact"     
##  [161] "charg"        "check"        "chees"        "chicago"     
##  [165] "chicken"      "chief"        "child"        "children"    
##  [169] "chocol"       "choic"        "christma"     "church"      
##  [173] "citi"         "claim"        "class"        "clean"       
##  [177] "clear"        "cleveland"    "close"        "cloth"       
##  [181] "club"         "coach"        "coffe"        "cold"        
##  [185] "collect"      "colleg"       "color"        "combin"      
##  [189] "come"         "comment"      "commit"       "communiti"   
##  [193] "compani"      "competit"     "complet"      "comput"      
##  [197] "concern"      "condit"       "confer"       "congrat"     
##  [201] "connect"      "consid"       "consum"       "contact"     
##  [205] "continu"      "control"      "cook"         "cool"        
##  [209] "correct"      "cost"         "couldnt"      "council"     
##  [213] "count"        "counti"       "countri"      "coupl"       
##  [217] "cours"        "court"        "cover"        "crazi"       
##  [221] "cream"        "creat"        "credit"       "cri"         
##  [225] "critic"       "cross"        "crowd"        "cultur"      
##  [229] "cup"          "current"      "custom"       "cut"         
##  [233] "cute"         "dad"          "daili"        "damn"        
##  [237] "danc"         "dark"         "data"         "date"        
##  [241] "daughter"     "david"        "day"          "dead"        
##  [245] "deal"         "dear"         "death"        "decid"       
##  [249] "decis"        "deep"         "defens"       "definit"     
##  [253] "degre"        "democrat"     "depart"       "describ"     
##  [257] "design"       "despit"       "detail"       "develop"     
##  [261] "didnt"        "die"          "differ"       "difficult"   
##  [265] "dinner"       "direct"       "director"     "discov"      
##  [269] "discuss"      "district"     "doctor"       "doesnt"      
##  [273] "dog"          "dollar"       "done"         "dont"        
##  [277] "door"         "doubl"        "draft"        "draw"        
##  [281] "dream"        "dress"        "drink"        "drive"       
##  [285] "drop"         "drug"         "dude"         "due"         
##  [289] "earli"        "earlier"      "earn"         "earth"       
##  [293] "easi"         "east"         "eat"          "econom"      
##  [297] "economi"      "educ"         "effect"       "effort"      
##  [301] "eight"        "either"       "elect"        "els"         
##  [305] "email"        "emerg"        "emot"         "employe"     
##  [309] "encourag"     "end"          "energi"       "enjoy"       
##  [313] "enough"       "enter"        "entir"        "especi"      
##  [317] "etc"          "even"         "event"        "ever"        
##  [321] "everi"        "everybodi"    "everyon"      "everyth"     
##  [325] "exact"        "exampl"       "except"       "excit"       
##  [329] "execut"       "exist"        "expect"       "experi"      
##  [333] "explain"      "express"      "extra"        "eye"         
##  [337] "face"         "facebook"     "fact"         "fail"        
##  [341] "fair"         "fall"         "famili"       "fan"         
##  [345] "far"          "fast"         "father"       "favorit"     
##  [349] "fear"         "featur"       "feder"        "feel"        
##  [353] "feet"         "fell"         "felt"         "field"       
##  [357] "fight"        "figur"        "file"         "fill"        
##  [361] "film"         "final"        "financi"      "find"        
##  [365] "fine"         "finish"       "fire"         "first"       
##  [369] "fit"          "five"         "fix"          "fli"         
##  [373] "floor"        "flower"       "focus"        "follow"      
##  [377] "food"         "footbal"      "forc"         "forget"      
##  [381] "form"         "former"       "forward"      "found"       
##  [385] "four"         "free"         "fresh"        "friday"      
##  [389] "friend"       "front"        "fuck"         "full"        
##  [393] "fun"          "fund"         "funni"        "futur"       
##  [397] "game"         "garden"       "gave"         "general"     
##  [401] "generat"      "get"          "giant"        "gift"        
##  [405] "girl"         "give"         "given"        "glad"        
##  [409] "glass"        "goal"         "god"          "goe"         
##  [413] "gone"         "gonna"        "good"         "googl"       
##  [417] "got"          "gotta"        "govern"       "great"       
##  [421] "green"        "ground"       "group"        "grow"        
##  [425] "guard"        "guess"        "guy"          "haha"        
##  [429] "hair"         "half"         "hall"         "hand"        
##  [433] "handl"        "hang"         "happen"       "happi"       
##  [437] "hard"         "hate"         "havent"       "head"        
##  [441] "health"       "hear"         "heard"        "heart"       
##  [445] "heat"         "held"         "hell"         "help"        
##  [449] "here"         "hes"          "hey"          "high"        
##  [453] "higher"       "hill"         "hire"         "histori"     
##  [457] "hit"          "hold"         "holiday"      "home"        
##  [461] "honor"        "hope"         "hospit"       "host"        
##  [465] "hot"          "hotel"        "hour"         "hous"        
##  [469] "howev"        "huge"         "human"        "hurt"        
##  [473] "husband"      "ice"          "idea"         "ill"         
##  [477] "imag"         "imagin"       "immedi"       "import"      
##  [481] "improv"       "includ"       "increas"      "individu"    
##  [485] "industri"     "inform"       "initi"        "injuri"      
##  [489] "insid"        "inspir"       "instead"      "interest"    
##  [493] "intern"       "internet"     "interview"    "investig"    
##  [497] "invit"        "involv"       "isnt"         "issu"        
##  [501] "item"         "ive"          "jame"         "jersey"      
##  [505] "jesus"        "job"          "john"         "join"        
##  [509] "joke"         "judg"         "juli"         "jump"        
##  [513] "june"         "just"         "keep"         "key"         
##  [517] "kick"         "kid"          "kill"         "kind"        
##  [521] "king"         "kitchen"      "knew"         "know"        
##  [525] "known"        "ladi"         "lake"         "land"        
##  [529] "larg"         "last"         "late"         "later"       
##  [533] "latest"       "laugh"        "law"          "lead"        
##  [537] "leader"       "leagu"        "learn"        "least"       
##  [541] "leav"         "led"          "left"         "legal"       
##  [545] "less"         "let"          "letter"       "level"       
##  [549] "librari"      "lie"          "life"         "light"       
##  [553] "like"         "limit"        "line"         "link"        
##  [557] "list"         "listen"       "littl"        "live"        
##  [561] "local"        "locat"        "lol"          "long"        
##  [565] "longer"       "look"         "lose"         "loss"        
##  [569] "lost"         "lot"          "loui"         "love"        
##  [573] "low"          "lower"        "luck"         "lunch"       
##  [577] "made"         "main"         "major"        "make"        
##  [581] "man"          "manag"        "mani"         "march"       
##  [585] "mark"         "market"       "marriag"      "match"       
##  [589] "matter"       "may"          "mayb"         "mean"        
##  [593] "measur"       "media"        "medic"        "meet"        
##  [597] "member"       "memori"       "men"          "mention"     
##  [601] "messag"       "met"          "michael"      "middl"       
##  [605] "might"        "mike"         "mile"         "million"     
##  [609] "mind"         "mine"         "minut"        "miss"        
##  [613] "mix"          "model"        "mom"          "moment"      
##  [617] "monday"       "money"        "month"        "morn"        
##  [621] "most"         "mother"       "move"         "movi"        
##  [625] "much"         "music"        "must"         "name"        
##  [629] "nation"       "natur"        "near"         "need"        
##  [633] "never"        "new"          "news"         "next"        
##  [637] "nice"         "night"        "normal"       "north"       
##  [641] "note"         "noth"         "notic"        "now"         
##  [645] "number"       "obama"        "obvious"      "offer"       
##  [649] "offic"        "offici"       "often"        "ohio"        
##  [653] "oil"          "okay"         "old"          "one"         
##  [657] "onlin"        "open"         "oper"         "opportun"    
##  [661] "option"       "order"        "oregon"       "organ"       
##  [665] "origin"       "other"        "outsid"       "owner"       
##  [669] "pack"         "page"         "paid"         "pain"        
##  [673] "paint"        "pair"         "paper"        "parent"      
##  [677] "park"         "part"         "parti"        "particip"    
##  [681] "particular"   "pass"         "past"         "paul"        
##  [685] "pay"          "peac"         "peopl"        "per"         
##  [689] "percent"      "perfect"      "perform"      "perhap"      
##  [693] "period"       "person"       "phone"        "photo"       
##  [697] "pic"          "pick"         "pictur"       "piec"        
##  [701] "pitch"        "place"        "plan"         "plant"       
##  [705] "play"         "player"       "pleas"        "plus"        
##  [709] "point"        "polic"        "polici"       "polit"       
##  [713] "poor"         "portland"     "posit"        "possibl"     
##  [717] "post"         "potenti"      "power"        "practic"     
##  [721] "prepar"       "present"      "presid"       "press"       
##  [725] "pretti"       "previous"     "price"        "prison"      
##  [729] "privat"       "probabl"      "problem"      "process"     
##  [733] "produc"       "product"      "program"      "project"     
##  [737] "promis"       "properti"     "propos"       "protect"     
##  [741] "proud"        "provid"       "public"       "publish"     
##  [745] "pull"         "purchas"      "push"         "put"         
##  [749] "qualiti"      "quarter"      "question"     "quick"       
##  [753] "quit"         "race"         "radio"        "rain"        
##  [757] "rais"         "ran"          "rate"         "rather"      
##  [761] "reach"        "read"         "readi"        "real"        
##  [765] "realiz"       "realli"       "reason"       "receiv"      
##  [769] "recent"       "recommend"    "record"       "red"         
##  [773] "refer"        "region"       "regular"      "relat"       
##  [777] "relationship" "releas"       "remain"       "rememb"      
##  [781] "remind"       "remov"        "replac"       "report"      
##  [785] "repres"       "republican"   "requir"       "research"    
##  [789] "resid"        "respect"      "respons"      "rest"        
##  [793] "restaur"      "result"       "retir"        "return"      
##  [797] "review"       "ride"         "right"        "rise"        
##  [801] "risk"         "river"        "road"         "robert"      
##  [805] "rock"         "role"         "roll"         "romney"      
##  [809] "room"         "rose"         "round"        "rule"        
##  [813] "run"          "sad"          "safe"         "said"        
##  [817] "sale"         "san"          "saturday"     "save"        
##  [821] "saw"          "say"          "scene"        "schedul"     
##  [825] "school"       "score"        "search"       "season"      
##  [829] "seat"         "second"       "secret"       "secur"       
##  [833] "see"          "seem"         "seen"         "select"      
##  [837] "sell"         "senat"        "send"         "senior"      
##  [841] "sens"         "sent"         "seri"         "serious"     
##  [845] "serv"         "servic"       "session"      "set"         
##  [849] "seven"        "sever"        "share"        "shes"        
##  [853] "shit"         "shoe"         "shoot"        "shop"        
##  [857] "short"        "shot"         "show"         "sick"        
##  [861] "side"         "sign"         "similar"      "simpl"       
##  [865] "simpli"       "sinc"         "sing"         "singl"       
##  [869] "sister"       "sit"          "site"         "situat"      
##  [873] "six"          "size"         "sleep"        "slow"        
##  [877] "small"        "smile"        "smith"        "smoke"       
##  [881] "social"       "someon"       "someth"       "sometim"     
##  [885] "son"          "song"         "soon"         "sorri"       
##  [889] "sort"         "sound"        "sourc"        "south"       
##  [893] "space"        "speak"        "special"      "specif"      
##  [897] "spend"        "spent"        "spirit"       "sport"       
##  [901] "spot"         "spring"       "squar"        "staff"       
##  [905] "stage"        "stand"        "standard"     "star"        
##  [909] "start"        "state"        "statement"    "station"     
##  [913] "stay"         "step"         "stick"        "still"       
##  [917] "stock"        "stop"         "store"        "stori"       
##  [921] "straight"     "street"       "strong"       "struggl"     
##  [925] "student"      "studi"        "stuff"        "style"       
##  [929] "subject"      "success"      "suck"         "suffer"      
##  [933] "suggest"      "summer"       "sun"          "sunday"      
##  [937] "super"        "support"      "suppos"       "sure"        
##  [941] "surpris"      "sweet"        "system"       "tabl"        
##  [945] "take"         "taken"        "talent"       "talk"        
##  [949] "tast"         "tax"          "teach"        "teacher"     
##  [953] "team"         "technolog"    "tell"         "term"        
##  [957] "test"         "text"         "thank"        "that"        
##  [961] "there"        "theyr"        "thing"        "think"       
##  [965] "third"        "though"       "thought"      "three"       
##  [969] "throw"        "thursday"     "ticket"       "tie"         
##  [973] "time"         "tire"         "titl"         "today"       
##  [977] "togeth"       "told"         "tomorrow"     "tonight"     
##  [981] "took"         "top"          "total"        "touch"       
##  [985] "tough"        "tour"         "toward"       "town"        
##  [989] "track"        "trade"        "tradit"       "train"       
##  [993] "travel"       "treat"        "tree"         "tri"         
##  [997] "trip"         "true"         "truli"        "trust"       
## [1001] "truth"        "tuesday"      "turn"         "tweet"       
## [1005] "twitter"      "two"          "type"         "understand"  
## [1009] "union"        "unit"         "univers"      "updat"       
## [1013] "upon"         "use"          "usual"        "valu"        
## [1017] "version"      "via"          "video"        "view"        
## [1021] "visit"        "voic"         "vote"         "wait"        
## [1025] "walk"         "wall"         "wanna"        "want"        
## [1029] "war"          "warm"         "washington"   "wasnt"       
## [1033] "watch"        "water"        "way"          "wear"        
## [1037] "weather"      "websit"       "wed"          "wednesday"   
## [1041] "week"         "weekend"      "welcom"       "well"        
## [1045] "went"         "west"         "weve"         "what"        
## [1049] "whether"      "white"        "whole"        "whose"       
## [1053] "wife"         "will"         "william"      "win"         
## [1057] "wine"         "winner"       "wish"         "wit"         
## [1061] "within"       "without"      "woman"        "women"       
## [1065] "won"          "wonder"       "wont"         "word"        
## [1069] "work"         "worker"       "world"        "worri"       
## [1073] "worth"        "wouldnt"      "wow"          "write"       
## [1077] "writer"       "wrong"        "wrote"        "yall"        
## [1081] "yard"         "yeah"         "year"         "yearold"     
## [1085] "yes"          "yesterday"    "yet"          "york"        
## [1089] "youll"        "young"        "your"         "youv"

Creating Preliminary Plots

Barplots

#install.packages("RColorBrewer")
library(RColorBrewer)
# Eight colours from the "Spectral" palette, used to colour the bars.
cols <- brewer.pal(n = 8, name = "Spectral")
# Bar chart of the 20 most frequent terms; las = 2 draws the axis labels
# perpendicular to the axes.
barplot(
  height = freq_d[1:20],
  main = "Barplot of Top 20 Terms",
  xlab = "Term",
  ylab = "Frequency",
  col = cols,
  las = 2,
  cex.names = 1
)

WordClouds

#install.packages("wordcloud")
library(wordcloud)
# freq_d already holds the per-term totals sorted in decreasing order, so it
# is reused here instead of expanding the sparse document-term matrix to a
# dense matrix a second time (and storing it under the name `matrix`).
words <- freq_d
df <- data.frame(word = names(words), freq = words)

# `colors` is spelled out in full rather than relying on partial matching
# of `col`.
# NOTE(review): scale = c(1.5, 1.5) gives the largest and smallest words the
# same size, so only colour distinguishes frequency - confirm this is intended.
wordcloud(
  words = df$word,
  freq = df$freq,
  max.words = 100,
  random.order = FALSE,
  colors = cols,
  scale = c(1.5, 1.5)
)

Future Directions

Moving forward, to create an adequate predictive algorithm, more cleaning is necessary, particularly for misspelled words. The next steps include creating n-grams to accurately predict the next words in the text.