Reading the files

# Directory that holds the raw en_US corpus files. The trailing slash matters:
# file names are appended to this prefix without inserting a separator.
URLfile <- "C:/Users/menno_000/Documents/R/Course data analytics/Capstone/final/en_US/"

# Full paths of the three raw source files (blogs, news, twitter).
blog_info <- sprintf("%s%s", URLfile, "en_US.blogs.txt")
news_info <- sprintf("%s%s", URLfile, "en_US.news.txt")
twitter_info <- sprintf("%s%s", URLfile, "en_US.twitter.txt")


## Reading the three raw corpora, one line of text per vector element.
##
## The connections are opened in binary mode ("rb"): readLines() accepts LF,
## CRLF and CR line endings whatever the connection mode, whereas a text-mode
## connection on Windows can treat an embedded 0x1A (Ctrl-Z) byte as
## end-of-file and silently return only part of the file.
## encoding = "UTF-8" marks the strings as UTF-8 (no re-encoding is done);
## skipNul = TRUE drops embedded NUL bytes instead of truncating the line.

## Reading blogs
con <- file(blog_info, "rb")
blogtext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading news
con <- file(news_info, "rb")
newstext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading twitter
con <- file(twitter_info, "rb")
twittertext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# The connection object is no longer needed once all three files are read.
rm(con)

Sampling the data (splitting into train and test files)

#' Split a text corpus into a training sample and a test remainder on disk
#'
#' Every element of `splitsbestand` is assigned to the training set
#' independently with probability 0.05 (using a fixed seed, so the split is
#' reproducible); all other elements form the test set. The two sets are
#' written, one element per line, to
#' `<bestandslocatie>en_US.<soort>train.txt` and
#' `<bestandslocatie>en_US.<soort>test.txt`.
#'
#' @param splitsbestand Character vector with one line of text per element.
#' @param bestandslocatie Output directory prefix. It is concatenated as-is
#'   with the file name, so it must end with a path separator.
#' @param soort Label inserted into the output file names (e.g. "blog").
#' @return `NULL`, invisibly. Called for its side effect of writing two files.
Splitsenbestand <- function(splitsbestand, bestandslocatie, soort) {
  # Fixed seed so that every run draws the same sample. Note that this resets
  # the global RNG state for the caller as well.
  set.seed(1000)
  in_train <- rbinom(length(splitsbestand), 1, 0.05) == 1

  # No-op for character input; keeps writeLines() from rejecting NULL or other
  # non-character input.
  tekst <- as.character(splitsbestand)

  doelpad <- function(deel) {
    paste0(bestandslocatie, "en_US.", soort, deel, ".txt")
  }

  # writeLines() with a path opens and closes the file itself (also when the
  # write fails) and terminates every line, including the last one.
  # useBytes = TRUE writes the strings byte-for-byte, so text marked as UTF-8
  # is not re-encoded to the platform's native encoding on the way out.
  writeLines(tekst[!in_train], doelpad("test"), useBytes = TRUE)
  writeLines(tekst[in_train], doelpad("train"), useBytes = TRUE)

  invisible(NULL)
}

## Split every corpus into train/test files next to the raw data.
## Argument order of Splitsenbestand: splitsbestand, bestandslocatie, soort.
invisible(Map(
  function(tekst, soort) Splitsenbestand(tekst, URLfile, soort),
  list(blogtext, newstext, twittertext),
  c("blog", "news", "twitter")
))

Reading the working set

In this chunk the working set (the training samples written above) is read, so that the rest of the assignment can be done in a speedy way. The three sets are then combined into a single working set, which is also written to a file for later use.

## Reading the three training samples, one line of text per vector element.
##
## URLfile already ends in "/", so the file name is appended without an extra
## separator; this yields the same paths the splitting step writes to.
## Binary mode ("rb") is used because readLines() handles LF/CRLF/CR in any
## mode, while a text-mode connection on Windows can stop early at an embedded
## 0x1A (Ctrl-Z) byte.

## Reading blogs
con <- file(paste0(URLfile, "en_US.blogtrain.txt"), "rb")
blogtext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading news
con <- file(paste0(URLfile, "en_US.newstrain.txt"), "rb")
newstext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

## Reading twitter
con <- file(paste0(URLfile, "en_US.twittertrain.txt"), "rb")
twittertext <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# The connection object is no longer needed once all three files are read.
rm(con)

## Combine the three training samples (blogs, then news, then twitter) and
## write the result to the combined file, one element per line.
Totaltext <- c(blogtext, newstext, twittertext)

# URLfile already ends in "/", so no extra separator is inserted.
# Given a path, writeLines() opens and closes the file itself;
# useBytes = TRUE writes the UTF-8 marked strings byte-for-byte instead of
# re-encoding them to the platform's native encoding.
writeLines(Totaltext, paste0(URLfile, "en_US.combined.txt"), useBytes = TRUE)

# Remove the three separate vectors for memory reasons; Totaltext holds them.
rm(blogtext, newstext, twittertext)