## Folder holding the raw en_US corpus; it ends in "/" so that file names
## can be appended to it directly.
URLfile <- "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/"

## Full paths of the three raw source files
blog_info <- paste(URLfile, "en_US.blogs.txt", sep = "")
news_info <- paste(URLfile, "en_US.news.txt", sep = "")
twitter_info <- paste(URLfile, "en_US.twitter.txt", sep = "")

## Blogs: read every line, marking text as UTF-8 and skipping embedded nuls
con <- file(description = blog_info, open = "r")
blogtext <- readLines(con = con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## News: same settings as for the blogs
con <- file(description = news_info, open = "r")
newstext <- readLines(con = con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Twitter: same settings as for the blogs
con <- file(description = twitter_info, open = "r")
twittertext <- readLines(con = con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## The connection handle is no longer needed
rm(con)
## Split a vector of text lines at random into a train file and a test file.
##
## Each line is assigned to the train set with probability `fractie`
## (independent Bernoulli draws); all remaining lines go to the test set.
## The two sets are written to
##   <bestandslocatie>en_US.<soort>train.txt
##   <bestandslocatie>en_US.<soort>test.txt
##
## Arguments:
##   splitsbestand   character vector, one element per line of text
##   bestandslocatie output folder; it is pasted directly in front of the
##                   file name, so it must end with a path separator
##   soort           label that becomes part of the file names (e.g. "blog")
##   fractie         probability that a line ends up in the train set
##   seed            value handed to set.seed() so the split is reproducible;
##                   note that this resets the global random number generator
##
## Returns NULL invisibly; the function is called for its side effect of
## writing the two files.
Splitsenbestand <- function(splitsbestand, bestandslocatie, soort,
                            fractie = 0.05, seed = 1000) {
  ## Write `regels` to `pad`, one element per line. useBytes = TRUE writes the
  ## strings byte-for-byte, so UTF-8 text is not translated to the native
  ## encoding on its way to disk. on.exit() closes the connection even when
  ## writing fails.
  schrijf_regels <- function(regels, pad) {
    con <- file(pad, "w")
    on.exit(close(con), add = TRUE)
    writeLines(regels, con, sep = "\n", useBytes = TRUE)
  }

  ## sampling preprocessing: one 0/1 draw per line, 1 = train
  set.seed(seed)
  in_train <- rbinom(length(splitsbestand), 1, fractie) == 1

  ## writing the dataset test
  schrijf_regels(
    splitsbestand[!in_train],
    paste0(bestandslocatie, "en_US.", soort, "test.txt")
  )

  ## writing the dataset train
  schrijf_regels(
    splitsbestand[in_train],
    paste0(bestandslocatie, "en_US.", soort, "train.txt")
  )

  invisible(NULL)
}
## Split each of the three corpora into a train and a test file; all output
## is written next to the source files in URLfile.
Splitsenbestand(splitsbestand = blogtext, bestandslocatie = URLfile, soort = "blog")
Splitsenbestand(splitsbestand = newstext, bestandslocatie = URLfile, soort = "news")
Splitsenbestand(splitsbestand = twittertext, bestandslocatie = URLfile, soort = "twitter")
In this chunk the working set is read, so that the rest of the assignment can be done in a speedy way. The three training sets are combined into a single working set that contains all three working files, and this combined set is also written to a file for later use.
## Read the three training samples back in. URLfile already ends in "/", so
## the file names are appended without an extra separator; this matches the
## "en_US.<soort>train.txt" names produced by Splitsenbestand.

## Reading blogs
con <- file(paste0(URLfile, "en_US.blogtrain.txt"), "r")
blogtext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading news
con <- file(paste0(URLfile, "en_US.newstrain.txt"), "r")
newstext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading twitter
con <- file(paste0(URLfile, "en_US.twittertrain.txt"), "r")
twittertext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)

## Combining the three samples (blogs, then news, then twitter) and writing
## the result to the combined file. useBytes = TRUE writes the strings
## byte-for-byte, so the UTF-8 text is not translated to the native encoding.
Totaltext <- c(blogtext, newstext, twittertext, recursive = FALSE)
con <- file(paste0(URLfile, "en_US.combined.txt"), "w")
writeLines(Totaltext, con, useBytes = TRUE)
close(con)

## Removing the connection and the three separate text vectors for memory
## reasons; only Totaltext is kept.
rm(con, blogtext, newstext, twittertext)