This document describes the intermediate results of the capstone project. In this project, text data is loaded, analyzed, and used to build a text prediction model.

This document consists of three parts: (1) loading the data, (2) exploring the data, and (3) next steps.

Loading the libs and data

Loading libs

The data.table package is used to transform and filter data with high performance. tm (which builds on NLP) is used for loading documents and transforming them to lower case. ngram is used to count words (wordcount) and to find the n-grams. ggplot2 and labeling are used for the graphs.

library(data.table,  lib.loc="C:/TFS/Rlib/")
library(ngram,  lib.loc="C:/TFS/Rlib/")
library(reshape,  lib.loc="C:/TFS/Rlib/")
library(NLP,  lib.loc="C:/TFS/Rlib/")
library(tm,  lib.loc="C:/TFS/Rlib/")
library(labeling,  lib.loc="C:/TFS/Rlib/")
library(textcat,  lib.loc="C:/TFS/Rlib/")
library(ggplot2,  lib.loc="C:/TFS/Rlib/")

Loading the data

# Read the first `n` lines of a text file into a character vector.
#
# path: location of the text file to sample.
# n:    maximum number of lines to read (default 100, the sample size used
#       in this report).
#
# Returns a character vector with at most `n` elements.
#
# The connection is closed through on.exit(), so it is released even when
# readLines() fails. encoding = "UTF-8" marks the strings as UTF-8; without
# it, multi-byte characters such as curly quotes are printed as mojibake
# (e.g. "â<U+0080><U+009C>") on systems whose native encoding is not UTF-8.
# skipNul = TRUE drops embedded nul bytes, if any, instead of truncating the
# affected line with a warning.
read_sample_lines <- function(path, n = 100) {
  con <- file(path, "r")
  on.exit(close(con), add = TRUE)
  readLines(con, n = n, encoding = "UTF-8", skipNul = TRUE)
}

# Twitter
df_txt_twitter <- data.frame(
  txt = read_sample_lines("files/final/en_US/en_US.twitter.txt"),
  stringsAsFactors = FALSE
)

# News
df_txt_news <- data.frame(
  txt = read_sample_lines("files/final/en_US/en_US.news.txt"),
  stringsAsFactors = FALSE
)

# Blog
df_txt_blog <- data.frame(
  txt = read_sample_lines("files/final/en_US/en_US.blogs.txt"),
  stringsAsFactors = FALSE
)

Explore data

View data

# Twitter: print the first three sampled lines
head(df_txt_twitter, n = 3L)

##                                                                                                               txt
## 1   How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
## 2 When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.
## 3                                                                        they've decided its more fun if I don't.

# News: print the first three sampled lines
head(df_txt_news, n = 3L)

##                                                                                                                                                                                 txt
## 1                                                                                                                                                 He wasn't home alone, apparently.
## 2                         The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s.
## 3 WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building.

# Blog: print the first three sampled lines
head(df_txt_blog, n = 3L)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    txt
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     In the years thereafter, most of the Oil fields and platforms were named after pagan â<U+0080><U+009C>godsâ<U+0080>.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               We love you Mr. Brown.
## 3 Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him.

Basic stats

The count_lines and count_words functions

# Count the sentences in a piece of text.
#
# x: character vector. Each element is split into sentences wherever a
#    period or question mark is followed by a space (". " or "? ").
#
# Returns a single integer: the total number of sentences over all elements
# of `x` (0 for an empty string or an empty vector).
#
# Despite the name, this counts sentences, not lines of the input file.
# The per-sentence word counts that used to be computed here were never
# used, so the function no longer needs wordcount() and works on vectors
# whose elements contain different numbers of sentences.
count_lines <- function(x) {
  sum(lengths(strsplit(x, "\\. |\\? ")))
}

# Count the words in a piece of text, sentence by sentence.
#
# x: a single character string. It is split into sentences wherever a
#    period or question mark is followed by a space (". " or "? ").
#
# Returns the sum of wordcount() over the resulting sentences.
# wordcount() is not defined here; it must be provided by one of the
# attached packages.
count_words <- function(x) {
  sentence_tbl <- data.frame(strsplit(x, "\\. |\\? "),
                             stringsAsFactors = FALSE)
  names(sentence_tbl) <- "zin"
  # one wordcount() call per sentence (row)
  words_per_sentence <- apply(sentence_tbl, 1, wordcount)
  sum(words_per_sentence)
}

Number of sentences, total number of words and average words per sentence: Twitter (first three lines)

# Totals over the first three Twitter lines: l = sentences, w = words
l <- sum(unlist(lapply(head(df_txt_twitter, 3)[["txt"]], count_lines)))
w <- sum(unlist(lapply(head(df_txt_twitter, 3)[["txt"]], count_words)))
# number of sentences (text is split on ". " and "? ")
l

## [1] 9

# total number of words
w

## [1] 51

# mean number of words per sentence
w/l

## [1] 5.666667

Number of sentences, total number of words and average words per sentence: Blogs (first three lines)

# Totals over the first three blog lines: l = sentences, w = words
l <- sum(unlist(lapply(head(df_txt_blog, 3)[["txt"]], count_lines)))
w <- sum(unlist(lapply(head(df_txt_blog, 3)[["txt"]], count_words)))
# number of sentences (text is split on ". " and "? ")
l

## [1] 6

# total number of words
w

## [1] 161

# mean number of words per sentence
w/l

## [1] 26.83333

Number of sentences, total number of words and average words per sentence: News (first three lines)

# Totals over the first three news lines: l = sentences, w = words
l <- sum(unlist(lapply(head(df_txt_news, 3)[["txt"]], count_lines)))
w <- sum(unlist(lapply(head(df_txt_news, 3)[["txt"]], count_words)))
# number of sentences (text is split on ". " and "? ")
l

## [1] 7

# total number of words
w

## [1] 63

# mean number of words per sentence
w/l

## [1] 9

Features of the data

# Build a corpus from the Twitter sample and lower-case its content.
# NOTE(review): VectorSource() receives the whole data frame rather than
# its txt column -- confirm this is the intended document layout.
dfCorpus <- Corpus(VectorSource(df_txt_twitter))
lower_case <- tm_map(dfCorpus, content_transformer(tolower))

# Flatten the lower-cased corpus content into a one-column data frame.
# NOTE(review): `dataframe` is not referenced again in this section;
# presumably loop_doc() reads it as a global -- verify before renaming.
dataframe <- data.frame(text = unlist(sapply(lower_case, `[`, "content")),
                        stringsAsFactors = FALSE)

# Some words are more frequent than others - what are the distributions of
# word frequencies?
# NOTE(review): loop_doc() is not defined in this document. The code below
# uses columns w1 and V1 of its result -- presumably a word and its
# frequency; verify against its definition.
q1 <- loop_doc(1, 100)
# The 50 largest V1 values; rows tied with them are kept as well, so top50
# can hold more than 50 rows.
y <- head(sort(q1$V1, decreasing = TRUE), n = 50)
top50 <- q1[q1$V1 %in% y, ]
# NOTE(review): a bare V1 inside [ ] only resolves to the column when q1 is
# a data.table -- confirm.
top50 <- top50[order(-V1), ]

# Rank of each row in the sorted table. seq_along() stays empty when top50
# has no rows, whereas 1:length() would produce c(1, 0).
x <- seq_along(top50$w1)
z <- data.frame(cbind(x, top50))
# Plot V1 against rank, drawing each point as its w1 label.
p <- ggplot(z, aes(x, V1, label = w1)) + geom_text()
p

Milestone Report - Capstone

Steve de Peijper

26 november 2016