**OBJECTIVE**
The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to:
library(tm)
library(RWeka)
library(openNLP)
library(tau)
library(Rstem)
library(SnowballC)
library(quanteda)
library(stringr)
library(slam)
library(stylo)
Setting the working directory and loading the files:
## Length Class Mode
## 77259 character character
## Length Class Mode
## 899288 character character
## Length Class Mode
## 2360148 character character
First, I analyse the number of lines in each file:
library(ggplot2)
# Tabulate the number of lines read from each source and draw them as a
# bar chart. The counts and labels are listed in the same order
# (blogs, news, twitter) so each bar is labelled with its own source.
numlines <- data.frame(
  numlines = c(length(blogs), length(news), length(tweets)),
  names = c("blogs", "news", "twitter"),
  stringsAsFactors = FALSE
)
ggplot(numlines, aes(x = names, y = numlines)) +
  geom_bar(stat = "identity", fill = "blue", color = "blue") +
  labs(
    x = "File source",
    y = "Total No. of Lines",
    title = "Total Line Count per File Source"
  )
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
##
## R.utils v2.2.0 (2015-12-09) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
##
## The following object is masked from 'package:utils':
##
## timestamp
##
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# Count the words in a character vector and return the grand total.
#
# A word is a maximal run of word characters (`\\w+`). Counting the words
# themselves, instead of "separators + 1", avoids over-counting lines that
# are empty, hold a single word, or begin/end with punctuation.
#
# x: character vector, one element per line of text.
# Returns: a single numeric value, the total number of words in `x`.
count_words <- function(x) {
  matches <- gregexpr("\\w+", x)
  # gregexpr() signals "no match" with -1, so only positive start
  # positions are real words.
  per_line <- vapply(matches, function(m) sum(m > 0L), integer(1))
  # Sum as double so a very large corpus cannot overflow integer range.
  sum(as.numeric(per_line))
}

News.words <- count_words(news)
blogs.words <- count_words(blogs)
tweets.words <- count_words(tweets)

# One row per source; the labels follow the order of the counts above.
numwords <- data.frame(numwords = c(News.words, blogs.words, tweets.words))
numwords$names <- c("news", "blogs", "twitter")

# Plot left disabled, as in the original report. The data argument must be
# `numwords` (not `numlines`) for the `numwords` column to be found.
#ggplot(numwords,aes(x=names,y=numwords)) + geom_bar(stat='identity', fill="red", color='blue') + xlab('File source') + ylab('Total No. of words') + ggtitle('Total Words Count per File Source')
Analyze the size of the files to check whether they are too big:
## Measure the on-disk size of each source file, in megabytes (10^6 bytes)
dir <- "D:/personal/data science/Capstone Project/final/en_US"

# Query all three files with a single file.info() call; the order
# (news, blogs, twitter) matches the labels assigned below.
sizes_mb <- file.info(
  file.path(dir, c("en_US.news.txt", "en_US.blogs.txt", "en_US.twitter.txt"))
)$size / 1e6
size.news <- sizes_mb[1]
size.blogs <- sizes_mb[2]
size.tweets <- sizes_mb[3]

sizefiles <- data.frame(sizefiles = c(size.news, size.blogs, size.tweets))
sizefiles$names <- c("news", "blogs", "twitter")

# Plot left disabled. If it is re-enabled, pass `sizefiles` as the data
# argument so that the `sizefiles` column can be found.
#ggplot(sizefiles,aes(x=names,y=sizefiles)) + geom_bar(stat='identity', fill="green", color='blue') + xlab('File source') + ylab('Size of each file') + ggtitle('Total size per File Source')
The files are too big to process in full, so I need to work with a smaller random sample:
# Draw 1000 random lines from each source to keep the exploratory
# analysis fast.
# Fix the RNG seed so that every run/knit draws the same samples.
set.seed(1234)
SampleTweets <- sample(tweets, 1000)
SampleBlogs <- sample(blogs, 1000)
SampleNews <- sample(news, 1000)

# Stack the three samples with c() so each sampled line remains its own
# document. paste() would glue the i-th blog, news and tweet lines together
# element-wise, creating word sequences that never occurred in any source.
total.samples <- c(SampleBlogs, SampleNews, SampleTweets)
summary(total.samples)
## Length Class Mode
## 3000 character character
## The first three sampled tweets, then one sampled blog entry and one
## sampled news entry. The `## [n]` lines are console output recorded from
## one particular run; the values depend on the random sample drawn.
SampleTweets[1:3]
## [1] "Uh, can I teach at West Nottingham Academy?! Those PLPeeps are doing cool things with alt. assessments."
## [2] "Award to Ranking Joe"
## [3] "#blue and #truth about success"
SampleBlogs[220]
## [1] "THIS IS SPINAL TAP (Christopher Guest, Michael McKean, Harry Shearer, Rob Reiner)"
SampleNews[50]
## [1] "Indians President Mark Shapiro seemed to be having a blast last week as employees and invited guests brought their kids to try out the new spaces."
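After checking the data, cleaning is the first step. However, stopwords won't be removed, because the goal of the Capstone Project is a prediction algorithm, and that algorithm needs to see stopwords in order to predict them.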
The next steps will be: