Milestone Report for the Data Science Capstone course

1. Download the Data and Load it into RStudio

1.1 Prepare the environment

Load the required R libraries.

library(dplyr)
library(tm)
library(ggplot2)
library(stringi)
library(wordcloud)
library(slam)
library(SnowballC)

1.2 Download the Data Files

We will be using the English training files (“en_US”). We check whether the files already exist in the working directory, and download and unzip them only if necessary.

# Create the data directory and download the archive if necessary
if (!file.exists("data/Coursera-SwiftKey.zip")) {
    dir.create("data", showWarnings = FALSE)
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", 
        destfile = "data/Coursera-SwiftKey.zip", mode = "wb")
}
# Check for the data file and extract it if necessary
# (the archive already contains the final/en_US/ folder structure)
if (!file.exists("data/final/en_US/en_US.blogs.txt")) {
    unzip("data/Coursera-SwiftKey.zip", exdir = "data")
}

1.3 Loading Raw Data

We load the three data files into RStudio through binary-mode connections so that no characters are lost while reading.

# Read blogs data
conn <- file("data/final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(conn, encoding = "UTF-8")
close(conn)

# Read news data
conn <- file("data/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(conn, encoding = "UTF-8")
close(conn)

# Read twitter data
conn <- file("data/final/en_US/en_US.twitter.txt", open = "rb")
tweets <- readLines(conn, encoding = "UTF-8")
close(conn)

rm(conn)
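
Since the three reads follow the same pattern, they could equally be wrapped in a small helper. This is purely an illustrative sketch; readTextFile is our own name, not a function from any package.

# Hypothetical helper repeating the binary-mode read pattern above
readTextFile <- function(path) {
    conn <- file(path, open = "rb")
    on.exit(close(conn))  # make sure the connection is always closed
    readLines(conn, encoding = "UTF-8")
}
# e.g. blogs <- readTextFile("data/final/en_US/en_US.blogs.txt")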

2. Summary Statistics

2.1 Words Per Line (WPL)

We start by analysing some basic statistics, such as the line, character, and word counts. We also analyse the Words Per Line (WPL) data to gain some insight into the type of text being analysed.

# Compute words per line for each data type
rawWPL <- lapply(list(blogs, news, tweets), function(x) stri_count_words(x))

# Compute statistics and summary info for each data type
rawstats <- data.frame(
    File = c("blogs", "news", "twitter"),
    t(rbind(sapply(list(blogs, news, tweets), stri_stats_general),
            TotalWords = sapply(list(blogs, news, tweets), stri_stats_latex)[4, ])),
    # Compute the words-per-line summary
    WPL = rbind(summary(rawWPL[[1]]), summary(rawWPL[[2]]), summary(rawWPL[[3]]))
)
print(rawstats)
##      File   Lines LinesNEmpty     Chars CharsNWhite TotalWords WPL.Min.
## 1   blogs  899288      899288 206824382   170389539   37570839        0
## 2    news 1010242     1010242 203223154   169860866   34494539        1
## 3 twitter 2360148     2360148 162096031   134082634   30451128        1
##   WPL.1st.Qu. WPL.Median WPL.Mean WPL.3rd.Qu. WPL.Max.
## 1           9         28 41.75107          60     6726
## 2          19         32 34.40997          46     1796
## 3           7         12 12.75063          18       47

From the table, we can see that blogs contain the most words per line (WPL mean = 41.75), followed by news (WPL mean = 34.41) and finally tweets (WPL mean = 12.75). WPL can help us infer the complexity of the text and the attention span of the typical reader.

2.2 Histograms of WPL

# Plot WPL histogram for blogs
qplot(rawWPL[[1]],geom="histogram",main="Histogram for US Blogs",
      xlab="No. of Words",ylab="Frequency",binwidth=10)

# Plot WPL histogram for news
qplot(rawWPL[[2]],geom="histogram",main="Histogram for US News",
      xlab="No. of Words",ylab="Frequency",binwidth=10)

# Plot WPL histogram for tweets
qplot(rawWPL[[3]],geom="histogram",main="Histogram for US Tweets",
      xlab="No. of Words",ylab="Frequency",binwidth=1)

rm(rawWPL);rm(rawstats)

The histograms are all right-skewed to varying degrees. This implies a preference towards short and concise communication.

3. Data Sampling

As the datasets are of sizeable dimensions, we will sample 30,000 lines from each before cleaning the data and removing profanities.

samplesize <- 30000  # Assign sample size
set.seed(1607)  # Ensure reproducibility 

# Create raw data and sample vectors
data <- list(blogs, news, tweets)
sample <- list()

# Iterate over each raw dataset to create a 'cleaned' sample of each
for (i in 1:length(data)) {
    # Draw a random sample of lines without replacement
    Filter <- sample(1:length(data[[i]]), samplesize, replace = FALSE)
    sample[[i]] <- data[[i]][Filter]
    # Remove unconventional/non-ASCII characters (iconv is vectorised)
    sample[[i]] <- iconv(sample[[i]], "latin1", "ASCII", sub = "")
}

rm(blogs)
rm(news)
rm(tweets)

3.1 Cleaning the Data Sample

We will create a corpus for each type of text, which will then be cleaned (i.e. converting the text to lowercase and removing numbers, punctuation, and English stopwords). The corpus will also be stemmed to collapse inflected forms of the same word (e.g. “connect”, “connecting”). Finally, a document term matrix is created to examine term frequencies in the texts.
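
As a quick illustration of what stemming does, the wordStem function from SnowballC (loaded above) reduces inflected forms to a common root:

# Illustrative only: Porter stemming collapses related word forms
wordStem(c("connect", "connecting", "connected"))  # each stems to "connect"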

# Create corpus and document term matrix vectors
corpus <- list()
dtMatrix <- list()

# Iterate over each sample to create a corpus and DTM for each
for (i in 1:length(sample)) {
    # Create the corpus
    corpus[[i]] <- VCorpus(VectorSource(sample[[i]]))
    # Cleaning/stemming the data (content_transformer keeps documents
    # as PlainTextDocuments, so no separate conversion step is needed)
    corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
    corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
    corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
    corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
    corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
    corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
    # Calculate the document term matrix for the corpus
    dtMatrix[[i]] <- DocumentTermMatrix(corpus[[i]],
        control = list(wordLengths = c(0, Inf)))
}
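
Note that the pipeline above does not yet include the profanity filter mentioned in Section 3. A minimal sketch of how one could be added with removeWords, assuming a plain-text word list (the file name profanity.txt is a placeholder, not part of the dataset):

# Illustrative sketch: remove profanities using an external word list
# ("profanity.txt" is assumed to contain one word per line)
profanity <- readLines("profanity.txt", encoding = "UTF-8")
for (i in 1:length(corpus)) {
    corpus[[i]] <- tm_map(corpus[[i]], removeWords, profanity)
}

In practice this step would sit before stemDocument in the loop above, so that the word list is matched against unstemmed tokens.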

rm(data)
rm(sample)
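
Before visualising the data, the term frequencies can also be inspected directly from the document term matrices, for example with col_sums from slam (loaded above); a short illustrative sketch:

# Illustrative: print the ten most frequent terms in each corpus
for (i in 1:length(dtMatrix)) {
    freq <- sort(col_sums(dtMatrix[[i]]), decreasing = TRUE)
    print(head(freq, 10))
}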

4. Word Cloud

We will visualise the data via word clouds. The more frequent a word is, the larger and more central it appears in the cloud.

set.seed(1607)  # Ensure reproducibility
par(mfrow = c(1, 3))  # Establish Plotting Panel
headings = c("US Blogs Word Cloud", "US News Word Cloud", "US Tweets Word Cloud")

# Iterate each corpus/DTM and plot word cloud for each
for (i in 1:length(corpus)) {
    wordcloud(words = colnames(dtMatrix[[i]]), freq = col_sums(dtMatrix[[i]]), 
        scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0.35, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
    title(headings[i])
}