The motivation for this project is to:
Demonstrate that you’ve downloaded the data and have successfully loaded it. Create a basic report of summary statistics about the data sets. Report any interesting findings that you have amassed so far. Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(tm)
## Loading required package: NLP
# NOTE(review): setwd() with a machine-specific absolute path makes this script
# non-portable. It is kept unchanged because the relative paths used by the
# rest of the script are resolved against this directory.
setwd("E:/New folder/rongeet Machine report")
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Fetch the archive only when it is not already on disk. mode = "wb" makes the
# binary transfer explicit so the zip is written byte-for-byte on Windows.
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, "Coursera-SwiftKey.zip", mode = "wb")
}

# Extract independently of the download step: an archive that was downloaded
# in an earlier run but never unpacked still gets extracted here.
if (!dir.exists("Coursera-SwiftKey")) {
  unzip("Coursera-SwiftKey.zip", exdir = "Coursera-SwiftKey")
}
# Define file paths
filepath.blog <- "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
filepath.twit <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
filepath.news <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"

# Read all lines of a text file as UTF-8 through a binary-mode connection.
#
# path: path to the text file.
# Returns a character vector with one element per line; embedded NUL bytes
# are skipped.
#
# The connection is what readLines() actually reads from, and it is closed
# even if readLines() fails. Previously the connections were opened but
# readLines() was given the path, so the binary-mode handle for the news file
# was ignored and never closed.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Load the datasets into memory
blogs <- read_corpus(filepath.blog)
twitter <- read_corpus(filepath.twit)
# NOTE(review): the earlier run recorded an "incomplete final line" warning for
# the news file, which suggests it was read only partially (presumably
# text-mode reading on Windows stopping at an embedded end-of-file control
# byte -- TODO confirm). Statistics recorded for `news` from that run should be
# regenerated after this change.
news <- read_corpus(filepath.news)
# Attach the packages used for the summaries and plots. library() stops with
# an error when a package is missing, whereas require() only returns FALSE and
# lets the script fail later with a less obvious message.
library(stringi)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate

# General statistics per corpus: line and character counts (total and
# non-empty / non-whitespace), as shown in the recorded output below.
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
# Blogs: structure of the raw vector, then the distribution of words per line.
summary(blogs)
## Length Class Mode
## 899288 character character
blog_words <- stri_count_words(blogs)
summary(blog_words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 28.00 41.75 60.00 6726.00
# Histogram of words per line. qplot() is deprecated in current ggplot2, so the
# plot is built explicitly; bins = 30 is the value the previous call fell back
# to, and the x-axis label stays "blog_words".
ggplot(data.frame(blog_words = blog_words), aes(x = blog_words)) +
  geom_histogram(bins = 30)
# Twitter: structure of the raw vector, then the distribution of words per line.
summary(twitter)
## Length Class Mode
## 2360148 character character
twit_words <- stri_count_words(twitter)
summary(twit_words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.75 18.00 47.00
# Histogram of words per line. qplot() is deprecated in current ggplot2, so the
# plot is built explicitly; bins = 30 is the value the previous call fell back
# to, and the x-axis label stays "twit_words".
ggplot(data.frame(twit_words = twit_words), aes(x = twit_words)) +
  geom_histogram(bins = 30)
# News: structure of the raw vector, then the distribution of words per line.
summary(news)
## Length Class Mode
## 77259 character character
news_words <- stri_count_words(news)
summary(news_words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.62 46.00 1123.00
# Histogram of words per line. qplot() is deprecated in current ggplot2, so the
# plot is built explicitly; bins = 30 is the value the previous call fell back
# to, and the x-axis label stays "news_words".
ggplot(data.frame(news_words = news_words), aes(x = news_words)) +
  geom_histogram(bins = 30)