setwd("C:/Users/Dmbewe/OneDrive - WRHI/Desktop/Coursera-SwiftKey/final/en_US")
trainURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
trainDataFile <- "data/Coursera-SwiftKey.zip"
if (!file.exists("data")) {
  dir.create("data")
}
if (!file.exists("data/final/en_US")) {
  tempFile <- tempfile(fileext = ".zip")
  download.file(trainURL, tempFile, mode = "wb")  # binary mode: the target is a zip archive
  unzip(tempFile, exdir = "data")
  unlink(tempFile)
}
blogsFileName <- "data/final/en_US/en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
newsFileName <- "data/final/en_US/en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(con, encoding = "UTF-8", skipNul = TRUE): incomplete final
## line found on 'data/final/en_US/en_US.news.txt'
close(con)
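# The warning above means the file's last line has no terminating newline; the
# line is still read, so it is harmless. Note, though, that the line count for
# en_US.news.txt (77,259) is suspiciously low for a 196 MB file; on some
# platforms a text-mode read of this file stops early at an embedded control
# character. A commonly used workaround (an alternative, not part of the
# original run) is to open the connection in binary mode:
#   con <- file(newsFileName, open = "rb")
#   news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
#   close(con)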
twitterFileName <- "data/final/en_US/en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)
sampleSize <- 0.01  # fraction of each corpus to sample for model building
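# sampleSize is not used in this section; a minimal sketch of how the 1% sample
# might be drawn later (assuming simple random sampling; the seed and object
# names below are illustrative, not from the original analysis):
set.seed(1234)
sampleBlogs   <- sample(blogs,   round(length(blogs)   * sampleSize))
sampleNews    <- sample(news,    round(length(news)    * sampleSize))
sampleTwitter <- sample(twitter, round(length(twitter) * sampleSize))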
fileSizeMB <- round(file.info(c(blogsFileName,
                                newsFileName,
                                twitterFileName))$size / 1024 ^ 2)
library(stringi)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.1.2
numLines <- sapply(list(blogs, news, twitter), length)
numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]  # row 4 = "Words"
wpl <- lapply(list(blogs, news, twitter), stri_count_words)
wplSummary <- sapply(wpl, function(x) summary(x)[c('Min.', 'Mean', 'Max.')])
rownames(wplSummary) <- c('WPL.Min', 'WPL.Mean', 'WPL.Max')
corpusSummary <- data.frame(  # a distinct name, so base::summary() is not masked
  File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  FileSize = paste(fileSizeMB, "MB"),
  Lines = numLines,
  Characters = numChars,
  Words = numWords,
  t(round(wplSummary))
)
print(corpusSummary)
##                File FileSize   Lines Characters    Words WPL.Min WPL.Mean WPL.Max
## 1   en_US.blogs.txt   200 MB  899288  206824505 37570839       0       42    6726
## 2    en_US.news.txt   196 MB   77259   15639408  2651432       1       35    1123
## 3 en_US.twitter.txt   159 MB 2360148  162096241 30451170       1       13      47
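# kableExtra is loaded above but never used; a minimal sketch rendering the same
# summary as a styled table (assumes an HTML output target, e.g. knitted R Markdown):
corpusSummary %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)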
library(ggplot2)
library(gridExtra)
plot1 <- qplot(wpl[[1]],
               geom = "histogram",
               main = "US Blogs",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)
print(plot1)
plot2 <- qplot(wpl[[2]],
               geom = "histogram",
               main = "US News",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)
print(plot2)
plot3 <- qplot(wpl[[3]],
               geom = "histogram",
               main = "US Twitter",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 1)
print(plot3)
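# gridExtra is loaded above but never used; a minimal sketch placing the three
# histograms side by side in a single figure:
grid.arrange(plot1, plot2, plot3, ncol = 3)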
SUMMARY AND FUTURE PLANS

These text files are large: the biggest, en_US.blogs.txt, is about 200 MB. The data shows that blogs tend to have the most words per line, followed by news, with Twitter having the fewest.

Going forward, our predictive algorithm will use an n-gram model built on cleaner, sampled data.
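As a rough illustration of the planned n-gram approach (a sketch, not the final implementation; makeNgrams and sampleBlogs below are illustrative names, the latter from the sampling sketch above), bigrams can be counted with stringi, which is already loaded:

makeNgrams <- function(lines, n = 2) {
  # Lowercase and split into words; line boundaries are ignored for simplicity
  words <- unlist(stri_extract_all_words(stri_trans_tolower(lines)))
  words <- words[!is.na(words)]
  if (length(words) < n) return(character(0))
  sapply(seq_len(length(words) - n + 1),
         function(i) paste(words[i:(i + n - 1)], collapse = " "))
}
bigrams <- makeNgrams(sampleBlogs, n = 2)
head(sort(table(bigrams), decreasing = TRUE), 10)  # ten most frequent bigrams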