setwd("C:/Users/Dmbewe/OneDrive - WRHI/Desktop/Coursera-SwiftKey/final/en_US")
trainURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
trainDataFile <- "data/Coursera-SwiftKey.zip"
if (!file.exists("data")) {
  dir.create("data")
}
if (!file.exists("data/final/en_US")) {
  tempFile <- tempfile(fileext = ".zip")
  download.file(trainURL, tempFile, mode = "wb")  # binary mode: the target is a zip archive
  unzip(tempFile, exdir = "data")
  unlink(tempFile)
}
blogsFileName <- "data/final/en_US/en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
newsFileName <- "data/final/en_US/en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(con, encoding = "UTF-8", skipNul = TRUE): incomplete final
## line found on 'data/final/en_US/en_US.news.txt'
close(con)
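# The warning above means the file's last line has no terminating newline; the
# line is still read, so it is harmless. Note, though, that the line count for
# en_US.news.txt (77,259) is suspiciously low for a 196 MB file; on some
# platforms a text-mode read of this file stops early at an embedded control
# character. A commonly used workaround (an alternative, not part of the
# original run) is to open the connection in binary mode:
#   con <- file(newsFileName, open = "rb")
#   news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
#   close(con)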
twitterFileName <- "data/final/en_US/en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)
sampleSize <- 0.01  # fraction of each corpus to sample for model building
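# sampleSize is not used in this section; a minimal sketch of how the 1% sample
# might be drawn later (assuming simple random sampling; the seed and object
# names below are illustrative, not from the original analysis):
set.seed(1234)
sampleBlogs   <- sample(blogs,   round(length(blogs)   * sampleSize))
sampleNews    <- sample(news,    round(length(news)    * sampleSize))
sampleTwitter <- sample(twitter, round(length(twitter) * sampleSize))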
fileSizeMB <- round(file.info(c(blogsFileName,
                                newsFileName,
                                twitterFileName))$size / 1024 ^ 2)
library(stringi)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.1.2
numLines <- sapply(list(blogs, news, twitter), length)
numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]  # row 4 = "Words"
wpl <- lapply(list(blogs, news, twitter), stri_count_words)
wplSummary <- sapply(wpl, function(x) summary(x)[c('Min.', 'Mean', 'Max.')])
rownames(wplSummary) <- c('WPL.Min', 'WPL.Mean', 'WPL.Max')
corpusSummary <- data.frame(  # a distinct name, so base::summary() is not masked
  File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  FileSize = paste(fileSizeMB, "MB"),
  Lines = numLines,
  Characters = numChars,
  Words = numWords,
  t(round(wplSummary))
)
print(corpusSummary)
##                File FileSize   Lines Characters    Words WPL.Min WPL.Mean WPL.Max
## 1   en_US.blogs.txt   200 MB  899288  206824505 37570839       0       42    6726
## 2    en_US.news.txt   196 MB   77259   15639408  2651432       1       35    1123
## 3 en_US.twitter.txt   159 MB 2360148  162096241 30451170       1       13      47
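# kableExtra is loaded above but never used; a minimal sketch rendering the same
# summary as a styled table (assumes an HTML output target, e.g. knitted R Markdown):
corpusSummary %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)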
library(ggplot2)
library(gridExtra)
plot1 <- qplot(wpl[[1]],
               geom = "histogram",
               main = "US Blogs",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)
print(plot1)
plot2 <- qplot(wpl[[2]],
               geom = "histogram",
               main = "US News",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)
print(plot2)
plot3 <- qplot(wpl[[3]],
               geom = "histogram",
               main = "US Twitter",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 1)
print(plot3)
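# gridExtra is loaded above but never used; a minimal sketch placing the three
# histograms side by side in a single figure:
grid.arrange(plot1, plot2, plot3, ncol = 3)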
SUMMARY AND FUTURE PLANS

These text files are large: the biggest, en_US.blogs.txt, is about 200 MB. The data shows that blogs tend to have the most words per line, followed by news, with Twitter having the fewest.

Going forward, our predictive algorithm will use an n-gram model built on cleaner, sampled data.
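As a rough illustration of the planned n-gram approach (a sketch, not the final implementation; makeNgrams and sampleBlogs below are illustrative names, the latter from the sampling sketch above), bigrams can be counted with stringi, which is already loaded:

makeNgrams <- function(lines, n = 2) {
  # Lowercase and split into words; line boundaries are ignored for simplicity
  words <- unlist(stri_extract_all_words(stri_trans_tolower(lines)))
  words <- words[!is.na(words)]
  if (length(words) < n) return(character(0))
  sapply(seq_len(length(words) - n + 1),
         function(i) paste(words[i:(i + n - 1)], collapse = " "))
}
bigrams <- makeNgrams(sampleBlogs, n = 2)
head(sort(table(bigrams), decreasing = TRUE), 10)  # ten most frequent bigrams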