The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
setwd('/Users/emmasun/Desktop/Capstone/')
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(stringi)
knitr::opts_chunk$set(echo = TRUE)
# Read a text file and return its lines as a character vector marked UTF-8.
#
# path: location of the text file to read.
#
# The file is opened through a binary connection and embedded NUL bytes are
# skipped. Without `skipNul = TRUE`, readLines() ends a line at the first NUL
# and warns, which is what happened for four lines of the twitter file.
# on.exit() closes the connection even if reading fails.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# All three sources are loaded the same way.
blogs <- read_text_lines("/Users/emmasun/Desktop/Capstone/en_US.blogs.txt")
twitter <- read_text_lines("/Users/emmasun/Desktop/Capstone/en_US.twitter.txt")
news <- read_text_lines("/Users/emmasun/Desktop/Capstone/en_US.news.txt")
# Blogs file: on-disk metadata (size, timestamps, owner), then a preview of
# the first five entries
blogs_path <- "/Users/emmasun/Desktop/Capstone/en_US.blogs.txt"
file.info(blogs_path)
## size isdir mode
## /Users/emmasun/Desktop/Capstone/en_US.blogs.txt 210160014 FALSE 644
## mtime
## /Users/emmasun/Desktop/Capstone/en_US.blogs.txt 2014-07-22 10:13:06
## ctime
## /Users/emmasun/Desktop/Capstone/en_US.blogs.txt 2017-02-20 21:05:27
## atime uid
## /Users/emmasun/Desktop/Capstone/en_US.blogs.txt 2017-02-21 20:23:25 501
## gid uname grname
## /Users/emmasun/Desktop/Capstone/en_US.blogs.txt 20 emmasun staff
head(blogs, n = 5L)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
## [5] "With graduation season right around the corner, Nancy has whipped up a fun set to help you out with not only your graduation cards and gifts, but any occasion that brings on a change in one's life. I stamped the images in Memento Tuxedo Black and cut them out with circle Nestabilities. I embossed the kraft and red cardstock with TE's new Stars Impressions Plate, which is double sided and gives you 2 fantastic patterns. You can see how to use the Impressions Plates in this tutorial Taylor created. Just one pass through your die cut machine using the Embossing Pad Kit is all you need to do - super easy!"
# Twitter file: on-disk metadata (size, timestamps, owner), then a preview of
# the first five entries
twitter_path <- "/Users/emmasun/Desktop/Capstone/en_US.twitter.txt"
file.info(twitter_path)
## size isdir mode
## /Users/emmasun/Desktop/Capstone/en_US.twitter.txt 167105338 FALSE 644
## mtime
## /Users/emmasun/Desktop/Capstone/en_US.twitter.txt 2014-07-22 10:12:58
## ctime
## /Users/emmasun/Desktop/Capstone/en_US.twitter.txt 2017-02-20 21:05:28
## atime uid
## /Users/emmasun/Desktop/Capstone/en_US.twitter.txt 2017-02-21 20:23:47 501
## gid uname grname
## /Users/emmasun/Desktop/Capstone/en_US.twitter.txt 20 emmasun staff
head(twitter, n = 5L)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"
## [5] "Words from a complete stranger! Made my birthday even better :)"
# News file: on-disk metadata (size, timestamps, owner), then a preview of
# the first five entries
news_path <- "/Users/emmasun/Desktop/Capstone/en_US.news.txt"
file.info(news_path)
## size isdir mode
## /Users/emmasun/Desktop/Capstone/en_US.news.txt 205811889 FALSE 644
## mtime
## /Users/emmasun/Desktop/Capstone/en_US.news.txt 2014-07-22 10:13:04
## ctime
## /Users/emmasun/Desktop/Capstone/en_US.news.txt 2017-02-20 21:05:27
## atime uid gid
## /Users/emmasun/Desktop/Capstone/en_US.news.txt 2017-02-21 20:24:12 501 20
## uname grname
## /Users/emmasun/Desktop/Capstone/en_US.news.txt emmasun staff
head(news, n = 5L)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
## [4] "The Alaimo Group of Mount Holly was up for a contract last fall to evaluate and suggest improvements to Trenton Water Works. But campaign finance records released this week show the two employees donated a total of $4,500 to the political action committee (PAC) Partners for Progress in early June. Partners for Progress reported it gave more than $10,000 in both direct and in-kind contributions to Mayor Tony Mack in the two weeks leading up to his victory in the mayoral runoff election June 15."
## [5] "And when it's often difficult to predict a law's impact, legislators should think twice before carrying any bill. Is it absolutely necessary? Is it an issue serious enough to merit their attention? Will it definitely not make the situation worse?"
# Line and character totals for each source, as reported by stringi
stringi::stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stringi::stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096031 134082634
stringi::stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866
# Words per line for each source, followed by a six-number summary of the
# resulting distribution
blogs.words <- stringi::stri_count_words(blogs)
summary(object = blogs.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.75 60.00 6726.00
news.words <- stringi::stri_count_words(news)
summary(object = news.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.41 46.00 1796.00
twitter.words <- stringi::stri_count_words(twitter)
summary(object = twitter.words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
# Draw a reproducible random 1% of the lines from each source and pool them.
# The draws happen in the order blogs, news, twitter, so the seed yields the
# same sample on every run.
set.seed(679)
sample_fraction <- 0.01
data.sample <- c(
  sample(blogs, size = length(blogs) * sample_fraction),
  sample(news, size = length(news) * sample_fraction),
  sample(twitter, size = length(twitter) * sample_fraction)
)
# Create corpus and clean the data.
# NOTE: the cleaning steps below differ from the run that produced the outputs
# recorded later in this report (term counts, top words); re-knit to refresh.
corpus <- VCorpus(VectorSource(data.sample))

# Helper: replace every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Helper: turn typographic single quotes into ASCII apostrophes so that
# contractions written with a curly apostrophe match the stop-word list.
straightenQuotes <- content_transformer(
  function(x) gsub("[\u2018\u2019]", "'", x)
)

corpus <- tm_map(corpus, straightenQuotes)
# Blank out URLs. The match stops at the next whitespace; a greedy ".*" would
# swallow everything up to the last ".word" in the document.
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://\\S+")
corpus <- tm_map(corpus, toSpace, "/|@|\\|")
# Convert to lowercase. content_transformer() keeps each element a text
# document, so no PlainTextDocument repair step is needed afterwards.
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove English stop words BEFORE punctuation: the list contains contractions
# such as "don't", which no longer match once the apostrophe is stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Strip whitespace last so the gaps left by the removals above are collapsed
corpus <- tm_map(corpus, stripWhitespace)
# Build the sparse term-count matrices in both orientations:
# m1 has documents as rows, m2 has terms as rows.
m1 <- tm::DocumentTermMatrix(corpus)
print(m1)
## <<DocumentTermMatrix (documents: 42695, terms: 55963)>>
## Non-/sparse entries: 514074/2388826211
## Sparsity : 100%
## Maximal term length: 112
## Weighting : term frequency (tf)
m2 <- tm::TermDocumentMatrix(corpus)
print(m2)
## <<TermDocumentMatrix (terms: 55963, documents: 42695)>>
## Non-/sparse entries: 514074/2388826211
## Sparsity : 100%
## Maximal term length: 112
## Weighting : term frequency (tf)
# Total count of every term across all sampled documents.
#
# The totals are computed directly from the sparse triplet representation of
# m1 (m1$j = column index, m1$v = count of each non-zero cell). Calling
# as.matrix(m1) instead would allocate a dense 42695 x 55963 matrix, i.e.
# about 2.4 billion cells, almost all of them zero.
# The unused intermediates of the earlier version (`ord`, the dense copy `m`,
# and `m1s`, whose totals were overwritten immediately) are no longer created.
freq <- numeric(nTerms(m1))
if (length(m1$v) > 0) {
  # rowsum() adds up the counts per column index; its row names are the indices
  nonzero_totals <- rowsum(m1$v, m1$j)
  freq[as.integer(rownames(nonzero_totals))] <- nonzero_totals[, 1]
}
names(freq) <- Terms(m1)
length(freq)
## [1] 55963
# Dimensions are read from the sparse matrix itself
dim(m1)
## [1] 42695 55963
#place the data in the order of frequency
freq <- sort(freq, decreasing = TRUE)
head(freq, 40)
## will just said one like can get time new good
## 3256 3062 2990 2869 2832 2465 2248 2157 1986 1780
## now dont day love know people back see last great
## 1780 1743 1675 1641 1621 1605 1521 1348 1300 1276
## first also think make going year well much way two
## 1274 1273 1267 1241 1228 1214 1207 1190 1166 1142
## want really today even got still thanks right years work
## 1138 1118 1111 1107 1101 1050 1044 1040 1036 1000
# Tabulate the term totals: one row per term, in the order held by `freq`
wordfrequency <- data.frame(
  word = names(freq),
  freq = freq
)
head(wordfrequency, n = 6L)
## word freq
## will will 3256
## just just 3062
## said said 2990
## one one 2869
## like like 2832
## can can 2465
# Bar chart of the most frequent terms.
#
# The terms shown are the `top_n_terms` highest counts rather than everything
# above a fixed count, so the chart stays meaningful when the sample (and
# therefore the absolute counts) changes. Bars are ordered by frequency
# instead of alphabetically.
top_n_terms <- 20
ranked_words <- wordfrequency[order(wordfrequency$freq, decreasing = TRUE), ]
top_words <- head(ranked_words, top_n_terms)
p <- ggplot(top_words, aes(x = reorder(word, -freq), y = freq))
p <- p + geom_bar(stat = "identity")
p <- p + labs(x = "Word", y = "Frequency")
# Slanted labels keep the words readable on the x axis
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p