This milestone report is the Week 2 project of the Data Science Capstone course in the Data Science Specialization offered by Johns Hopkins University.
This report summarizes basic exploratory analyses using the prepared and cleaned data from Week 1 of the Data Science Capstone Project Course. In addition, I examine the findings from these analyses to create preliminary N-gram plots.
#install.packages("knitr")
library(knitr)
library(NLP)
library(tm)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("ngram")
library(ngram)
#install.packages("tidyr")
library(tidyr)
#install.packages("tokenizers")
library(tokenizers)
#install.packages("SnowballC")
library(SnowballC)
#install.packages("tidytext")
library(tidytext)
### Formulating paths
BLGpath <- "en_US.blogs.txt"
NWSpath <- "en_US.news.txt"
TWIpath <- "en_US.twitter.txt"
# Directory that holds the three en_US text files. Full paths are built from
# it with file.path(), so the working directory is never changed.
data_dir <- "C:/Users/mds6592/Desktop//final/en_US"

# Read one text file and return its lines as an ASCII-only character vector.
#
# path: location of the file to read.
# Returns: a character vector with one element per line; any character that
#   cannot be converted to ASCII is dropped (sub = "").
read_ascii_lines <- function(path) {
  # Opened in binary mode ("rb"), exactly as the files were read before.
  conn <- file(path, open = "rb")
  # Release the connection even when readLines() stops with an error.
  on.exit(close(conn), add = TRUE)
  lines <- readLines(conn, encoding = "UTF-8", skipNul = TRUE)
  iconv(lines, to = "ASCII", sub = "")
}

### Reading blog files
blogs <- read_ascii_lines(file.path(data_dir, BLGpath))
### Reading news files
news <- read_ascii_lines(file.path(data_dir, NWSpath))
### Reading twitter files
twitter <- read_ascii_lines(file.path(data_dir, TWIpath))
# The connection is local to read_ascii_lines(), so there is no global
# connection variable to remove afterwards.
# Number of lines read from each source.
print(length(blogs))
## [1] 899288
print(length(news))
## [1] 1010242
print(length(twitter))
## [1] 2360148
# Rough word totals: "\\W+" matches each run of non-word characters, so every
# total below is the number of such runs summed over all lines of a source.
# Wrapping an assignment in parentheses stores the value and prints it.
(BlogsWC <- blogs %>% str_count("\\W+") %>% sum())
## [1] 37799774
(NewsWC <- news %>% str_count("\\W+") %>% sum())
## [1] 35582408
(TwitterWC <- twitter %>% str_count("\\W+") %>% sum())
## [1] 30406254
# Character totals: str_count() with its default pattern, summed over lines.
(BlogsCharCount <- blogs %>% str_count() %>% sum())
## [1] 206043906
(NewsCharCount <- news %>% str_count() %>% sum())
## [1] 202917604
(TwitterCharCount <- twitter %>% str_count() %>% sum())
## [1] 161961555
# Fix the random seed so the 1% samples, and every table and plot derived
# from them, come out the same on each run of the report.
set.seed(1234)
# Integer division gives an explicit whole-number sample size (1% of the
# lines, rounded down) instead of passing a fractional size to sample().
Blogs_Sample <- sample(blogs, length(blogs) %/% 100)
News_Sample <- sample(news, length(news) %/% 100)
Twitter_Sample <- sample(twitter, length(twitter) %/% 100)
# One combined sample, in the order blogs, news, twitter.
Total_Sample <- c(Blogs_Sample, News_Sample, Twitter_Sample)
# Build a cleaned tm corpus from a character vector of text.
#
# x: character vector with one document per element. Defaults to the global
#   Total_Sample.
# Returns: a VCorpus whose documents have been lower-cased, stripped of
#   punctuation and digits, had whitespace collapsed, had English stop words
#   removed, and been stemmed, in that order.
build_corpus <- function (x = Total_Sample) {
  texts <- VCorpus(VectorSource(x))
  # tolower() is a plain character function. content_transformer() wraps it
  # so each element stays a text document, which removes the need for a
  # final PlainTextDocument pass to rebuild the documents.
  texts <- tm_map(texts, content_transformer(tolower))
  texts <- tm_map(texts, removePunctuation)
  texts <- tm_map(texts, removeNumbers)
  texts <- tm_map(texts, stripWhitespace)
  # NOTE(review): punctuation is already gone at this point, so contractions
  # such as "don't" have become "dont" and presumably no longer match the
  # stop word list. Confirm whether stop words should be removed earlier.
  texts <- tm_map(texts, removeWords, stopwords("english"))
  texts <- tm_map(texts, stemDocument)
  # Return the corpus explicitly rather than ending on an assignment.
  texts
}
# Clean the combined sample and tabulate term counts per document.
CorpusData <- build_corpus(x = Total_Sample)
dtm <- DocumentTermMatrix(CorpusData)
# Corpus-wide count for every term: column totals of the document-term
# matrix after converting it to an ordinary matrix.
freq <- dtm %>% as.matrix() %>% colSums()
# Same counts, most frequent term first.
freq_d <- freq %>% sort(decreasing = TRUE)
# The ten most frequent terms.
print(freq_d[1:10])
## will one get just said like time can day year
## 3310 3070 3037 3025 2994 2993 2629 2495 2284 2199
# Every term that occurs at least 100 times.
print(findFreqTerms(dtm, lowfreq = 100))
## [1] "abl" "absolut" "accept" "access"
## [5] "accord" "account" "across" "act"
## [9] "action" "activ" "actual" "add"
## [13] "addit" "address" "administr" "adult"
## [17] "afternoon" "age" "agenc" "agent"
## [21] "ago" "agre" "ahead" "air"
## [25] "album" "allow" "almost" "alon"
## [29] "along" "alreadi" "also" "although"
## [33] "alway" "amaz" "america" "american"
## [37] "among" "amount" "angel" "anim"
## [41] "announc" "annual" "anoth" "answer"
## [45] "anyon" "anyth" "anyway" "apart"
## [49] "app" "appar" "appear" "appl"
## [53] "appli" "appreci" "approv" "april"
## [57] "area" "arent" "arm" "around"
## [61] "arrest" "arriv" "art" "articl"
## [65] "artist" "ask" "ass" "assist"
## [69] "associ" "attack" "attempt" "attend"
## [73] "attent" "attorney" "author" "avail"
## [77] "averag" "award" "away" "awesom"
## [81] "babi" "back" "bad" "bag"
## [85] "ball" "band" "bank" "bar"
## [89] "base" "basic" "battl" "beach"
## [93] "beat" "beauti" "becam" "becom"
## [97] "bed" "beer" "began" "begin"
## [101] "behind" "believ" "benefit" "best"
## [105] "better" "big" "biggest" "bill"
## [109] "billion" "birthday" "bit" "bitch"
## [113] "black" "bless" "block" "blog"
## [117] "blue" "board" "bodi" "book"
## [121] "bore" "born" "bought" "bowl"
## [125] "box" "boy" "brain" "brand"
## [129] "break" "bring" "brother" "brought"
## [133] "brown" "budget" "build" "busi"
## [137] "buy" "cake" "california" "call"
## [141] "came" "campaign" "can" "cant"
## [145] "car" "card" "care" "career"
## [149] "carri" "case" "catch" "caus"
## [153] "celebr" "center" "central" "certain"
## [157] "challeng" "chanc" "chang" "charact"
## [161] "charg" "check" "chees" "chicago"
## [165] "chicken" "chief" "child" "children"
## [169] "chocol" "choic" "christma" "church"
## [173] "citi" "claim" "class" "clean"
## [177] "clear" "cleveland" "close" "cloth"
## [181] "club" "coach" "coffe" "cold"
## [185] "collect" "colleg" "color" "combin"
## [189] "come" "comment" "commit" "communiti"
## [193] "compani" "competit" "complet" "comput"
## [197] "concern" "condit" "confer" "congrat"
## [201] "connect" "consid" "consum" "contact"
## [205] "continu" "control" "cook" "cool"
## [209] "correct" "cost" "couldnt" "council"
## [213] "count" "counti" "countri" "coupl"
## [217] "cours" "court" "cover" "crazi"
## [221] "cream" "creat" "credit" "cri"
## [225] "critic" "cross" "crowd" "cultur"
## [229] "cup" "current" "custom" "cut"
## [233] "cute" "dad" "daili" "damn"
## [237] "danc" "dark" "data" "date"
## [241] "daughter" "david" "day" "dead"
## [245] "deal" "dear" "death" "decid"
## [249] "decis" "deep" "defens" "definit"
## [253] "degre" "democrat" "depart" "describ"
## [257] "design" "despit" "detail" "develop"
## [261] "didnt" "die" "differ" "difficult"
## [265] "dinner" "direct" "director" "discov"
## [269] "discuss" "district" "doctor" "doesnt"
## [273] "dog" "dollar" "done" "dont"
## [277] "door" "doubl" "draft" "draw"
## [281] "dream" "dress" "drink" "drive"
## [285] "drop" "drug" "dude" "due"
## [289] "earli" "earlier" "earn" "earth"
## [293] "easi" "east" "eat" "econom"
## [297] "economi" "educ" "effect" "effort"
## [301] "eight" "either" "elect" "els"
## [305] "email" "emerg" "emot" "employe"
## [309] "encourag" "end" "energi" "enjoy"
## [313] "enough" "enter" "entir" "especi"
## [317] "etc" "even" "event" "ever"
## [321] "everi" "everybodi" "everyon" "everyth"
## [325] "exact" "exampl" "except" "excit"
## [329] "execut" "exist" "expect" "experi"
## [333] "explain" "express" "extra" "eye"
## [337] "face" "facebook" "fact" "fail"
## [341] "fair" "fall" "famili" "fan"
## [345] "far" "fast" "father" "favorit"
## [349] "fear" "featur" "feder" "feel"
## [353] "feet" "fell" "felt" "field"
## [357] "fight" "figur" "file" "fill"
## [361] "film" "final" "financi" "find"
## [365] "fine" "finish" "fire" "first"
## [369] "fit" "five" "fix" "fli"
## [373] "floor" "flower" "focus" "follow"
## [377] "food" "footbal" "forc" "forget"
## [381] "form" "former" "forward" "found"
## [385] "four" "free" "fresh" "friday"
## [389] "friend" "front" "fuck" "full"
## [393] "fun" "fund" "funni" "futur"
## [397] "game" "garden" "gave" "general"
## [401] "generat" "get" "giant" "gift"
## [405] "girl" "give" "given" "glad"
## [409] "glass" "goal" "god" "goe"
## [413] "gone" "gonna" "good" "googl"
## [417] "got" "gotta" "govern" "great"
## [421] "green" "ground" "group" "grow"
## [425] "guard" "guess" "guy" "haha"
## [429] "hair" "half" "hall" "hand"
## [433] "handl" "hang" "happen" "happi"
## [437] "hard" "hate" "havent" "head"
## [441] "health" "hear" "heard" "heart"
## [445] "heat" "held" "hell" "help"
## [449] "here" "hes" "hey" "high"
## [453] "higher" "hill" "hire" "histori"
## [457] "hit" "hold" "holiday" "home"
## [461] "honor" "hope" "hospit" "host"
## [465] "hot" "hotel" "hour" "hous"
## [469] "howev" "huge" "human" "hurt"
## [473] "husband" "ice" "idea" "ill"
## [477] "imag" "imagin" "immedi" "import"
## [481] "improv" "includ" "increas" "individu"
## [485] "industri" "inform" "initi" "injuri"
## [489] "insid" "inspir" "instead" "interest"
## [493] "intern" "internet" "interview" "investig"
## [497] "invit" "involv" "isnt" "issu"
## [501] "item" "ive" "jame" "jersey"
## [505] "jesus" "job" "john" "join"
## [509] "joke" "judg" "juli" "jump"
## [513] "june" "just" "keep" "key"
## [517] "kick" "kid" "kill" "kind"
## [521] "king" "kitchen" "knew" "know"
## [525] "known" "ladi" "lake" "land"
## [529] "larg" "last" "late" "later"
## [533] "latest" "laugh" "law" "lead"
## [537] "leader" "leagu" "learn" "least"
## [541] "leav" "led" "left" "legal"
## [545] "less" "let" "letter" "level"
## [549] "librari" "lie" "life" "light"
## [553] "like" "limit" "line" "link"
## [557] "list" "listen" "littl" "live"
## [561] "local" "locat" "lol" "long"
## [565] "longer" "look" "lose" "loss"
## [569] "lost" "lot" "loui" "love"
## [573] "low" "lower" "luck" "lunch"
## [577] "made" "main" "major" "make"
## [581] "man" "manag" "mani" "march"
## [585] "mark" "market" "marriag" "match"
## [589] "matter" "may" "mayb" "mean"
## [593] "measur" "media" "medic" "meet"
## [597] "member" "memori" "men" "mention"
## [601] "messag" "met" "michael" "middl"
## [605] "might" "mike" "mile" "million"
## [609] "mind" "mine" "minut" "miss"
## [613] "mix" "model" "mom" "moment"
## [617] "monday" "money" "month" "morn"
## [621] "most" "mother" "move" "movi"
## [625] "much" "music" "must" "name"
## [629] "nation" "natur" "near" "need"
## [633] "never" "new" "news" "next"
## [637] "nice" "night" "normal" "north"
## [641] "note" "noth" "notic" "now"
## [645] "number" "obama" "obvious" "offer"
## [649] "offic" "offici" "often" "ohio"
## [653] "oil" "okay" "old" "one"
## [657] "onlin" "open" "oper" "opportun"
## [661] "option" "order" "oregon" "organ"
## [665] "origin" "other" "outsid" "owner"
## [669] "pack" "page" "paid" "pain"
## [673] "paint" "pair" "paper" "parent"
## [677] "park" "part" "parti" "particip"
## [681] "particular" "pass" "past" "paul"
## [685] "pay" "peac" "peopl" "per"
## [689] "percent" "perfect" "perform" "perhap"
## [693] "period" "person" "phone" "photo"
## [697] "pic" "pick" "pictur" "piec"
## [701] "pitch" "place" "plan" "plant"
## [705] "play" "player" "pleas" "plus"
## [709] "point" "polic" "polici" "polit"
## [713] "poor" "portland" "posit" "possibl"
## [717] "post" "potenti" "power" "practic"
## [721] "prepar" "present" "presid" "press"
## [725] "pretti" "previous" "price" "prison"
## [729] "privat" "probabl" "problem" "process"
## [733] "produc" "product" "program" "project"
## [737] "promis" "properti" "propos" "protect"
## [741] "proud" "provid" "public" "publish"
## [745] "pull" "purchas" "push" "put"
## [749] "qualiti" "quarter" "question" "quick"
## [753] "quit" "race" "radio" "rain"
## [757] "rais" "ran" "rate" "rather"
## [761] "reach" "read" "readi" "real"
## [765] "realiz" "realli" "reason" "receiv"
## [769] "recent" "recommend" "record" "red"
## [773] "refer" "region" "regular" "relat"
## [777] "relationship" "releas" "remain" "rememb"
## [781] "remind" "remov" "replac" "report"
## [785] "repres" "republican" "requir" "research"
## [789] "resid" "respect" "respons" "rest"
## [793] "restaur" "result" "retir" "return"
## [797] "review" "ride" "right" "rise"
## [801] "risk" "river" "road" "robert"
## [805] "rock" "role" "roll" "romney"
## [809] "room" "rose" "round" "rule"
## [813] "run" "sad" "safe" "said"
## [817] "sale" "san" "saturday" "save"
## [821] "saw" "say" "scene" "schedul"
## [825] "school" "score" "search" "season"
## [829] "seat" "second" "secret" "secur"
## [833] "see" "seem" "seen" "select"
## [837] "sell" "senat" "send" "senior"
## [841] "sens" "sent" "seri" "serious"
## [845] "serv" "servic" "session" "set"
## [849] "seven" "sever" "share" "shes"
## [853] "shit" "shoe" "shoot" "shop"
## [857] "short" "shot" "show" "sick"
## [861] "side" "sign" "similar" "simpl"
## [865] "simpli" "sinc" "sing" "singl"
## [869] "sister" "sit" "site" "situat"
## [873] "six" "size" "sleep" "slow"
## [877] "small" "smile" "smith" "smoke"
## [881] "social" "someon" "someth" "sometim"
## [885] "son" "song" "soon" "sorri"
## [889] "sort" "sound" "sourc" "south"
## [893] "space" "speak" "special" "specif"
## [897] "spend" "spent" "spirit" "sport"
## [901] "spot" "spring" "squar" "staff"
## [905] "stage" "stand" "standard" "star"
## [909] "start" "state" "statement" "station"
## [913] "stay" "step" "stick" "still"
## [917] "stock" "stop" "store" "stori"
## [921] "straight" "street" "strong" "struggl"
## [925] "student" "studi" "stuff" "style"
## [929] "subject" "success" "suck" "suffer"
## [933] "suggest" "summer" "sun" "sunday"
## [937] "super" "support" "suppos" "sure"
## [941] "surpris" "sweet" "system" "tabl"
## [945] "take" "taken" "talent" "talk"
## [949] "tast" "tax" "teach" "teacher"
## [953] "team" "technolog" "tell" "term"
## [957] "test" "text" "thank" "that"
## [961] "there" "theyr" "thing" "think"
## [965] "third" "though" "thought" "three"
## [969] "throw" "thursday" "ticket" "tie"
## [973] "time" "tire" "titl" "today"
## [977] "togeth" "told" "tomorrow" "tonight"
## [981] "took" "top" "total" "touch"
## [985] "tough" "tour" "toward" "town"
## [989] "track" "trade" "tradit" "train"
## [993] "travel" "treat" "tree" "tri"
## [997] "trip" "true" "truli" "trust"
## [1001] "truth" "tuesday" "turn" "tweet"
## [1005] "twitter" "two" "type" "understand"
## [1009] "union" "unit" "univers" "updat"
## [1013] "upon" "use" "usual" "valu"
## [1017] "version" "via" "video" "view"
## [1021] "visit" "voic" "vote" "wait"
## [1025] "walk" "wall" "wanna" "want"
## [1029] "war" "warm" "washington" "wasnt"
## [1033] "watch" "water" "way" "wear"
## [1037] "weather" "websit" "wed" "wednesday"
## [1041] "week" "weekend" "welcom" "well"
## [1045] "went" "west" "weve" "what"
## [1049] "whether" "white" "whole" "whose"
## [1053] "wife" "will" "william" "win"
## [1057] "wine" "winner" "wish" "wit"
## [1061] "within" "without" "woman" "women"
## [1065] "won" "wonder" "wont" "word"
## [1069] "work" "worker" "world" "worri"
## [1073] "worth" "wouldnt" "wow" "write"
## [1077] "writer" "wrong" "wrote" "yall"
## [1081] "yard" "yeah" "year" "yearold"
## [1085] "yes" "yesterday" "yet" "york"
## [1089] "youll" "young" "your" "youv"
#install.packages("RColorBrewer")
library(RColorBrewer)
# Eight-colour "Spectral" palette used by both plots below.
cols <- brewer.pal(8, "Spectral")
# Bar chart of the 20 most frequent terms. las = 2 draws the axis labels
# perpendicular to their axis.
barplot(
  freq_d[1:20],
  main = "Barplot of Top 20 Terms",
  xlab = "Term",
  ylab = "Frequency",
  col = cols,
  las = 2,
  cex.names = 1
)
#install.packages("wordcloud")
library(wordcloud)
# Term totals, most frequent first, arranged as a word/freq data frame.
matrix <- as.matrix(dtm)
words <- sort(colSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
# Word cloud of up to 100 terms, placed in decreasing order of frequency.
# The palette is passed as `colors`, the full argument name, rather than the
# abbreviation `col`, which only worked through partial argument matching.
# NOTE(review): both ends of `scale` are 1.5, so every word is presumably
# drawn at the same size. Confirm this is intended.
wordcloud(
  words = df$word,
  freq = df$freq,
  max.words = 100,
  random.order = FALSE,
  colors = cols,
  scale = c(1.5, 1.5)
)
Moving forward, building an adequate predictive algorithm will require more cleaning, particularly of misspelled words. The next steps include creating n-grams to accurately predict the next words in the text.