Capstone Quiz #1

Question 1

The en_US.blogs.txt file is how many megabytes?

# Report the size of the blogs corpus in megabytes.
setwd("D:/R/Work/CapStone/training data/final/en_US")
blogs_info <- file.info("en_US.blogs.txt")
# file.info() reports the size in bytes; 1024^2 bytes make one megabyte.
mb <- blogs_info$size / 1024^2
sprintf("The file is %s megabytes", mb)

## [1] "The file is 200.424207687378 megabytes"

Question 2

The en_US.twitter.txt has how many lines of text?

# Count the lines of the twitter corpus by streaming the file as raw bytes
# in 65536-byte chunks and tallying the newline bytes in each chunk.
setwd("D:/R/Work/CapStone/training data/final/en_US")

newline_byte <- as.raw(10L)  # byte value 10 is "\n"
f <- file("en_US.twitter.txt", open = "rb")
nlines <- 0L
repeat {
  chunk <- readBin(f, what = "raw", n = 65536)
  if (length(chunk) == 0) {
    # A zero-length read means there is nothing left in the connection.
    break
  }
  nlines <- nlines + sum(chunk == newline_byte)
}
close(f)
sprintf("The file has %s lines.", nlines)

## [1] "The file has 2360148 lines."

Question 3

What is the length of the longest line seen in any of the three en_US files?

# Read the three en_US corpora into memory and record the length, in
# characters, of the longest line in the news and blogs files.
setwd("D:/R/Work/CapStone/training data/final/en_US")

news <- file("en_US.news.txt", open = "rb")
news_lines <- readLines(news)
close(news)
# max() yields the longest line length directly as a plain number, instead
# of picking the sixth element out of a summary() table.
newsl <- max(nchar(news_lines))

blogs <- file("en_US.blogs.txt", open = "rb")
blog_lines <- readLines(blogs)
close(blogs)
blogsl <- max(nchar(blog_lines))

# The twitter connection is deliberately left open here: it is closed by the
# close(twitter) call that follows the recorded readLines() warnings.
twitter <- file("en_US.twitter.txt", open = "rb")
twitter_lines <- readLines(twitter)

## Warning in readLines(twitter): line 167155 appears to contain an embedded
## nul

## Warning in readLines(twitter): line 268547 appears to contain an embedded
## nul

## Warning in readLines(twitter): line 1274086 appears to contain an embedded
## nul

## Warning in readLines(twitter): line 1759032 appears to contain an embedded
## nul

# Release the twitter connection used by the readLines() call above.
close(twitter)
# Length, in characters, of the longest tweet. max() returns it directly as
# a plain number rather than as the sixth element of a summary() table.
twitterl <- max(nchar(twitter_lines))

# Select the file whose longest line is the longest of the three.
#
# which.max() handles every ordering of the three lengths, so `largestl` and
# `filenam` are always assigned (a nested if/else chain can fall through
# without assigning them when twitter >= news >= blogs).
# which.max() returns the first maximum, so on ties the order below prefers
# twitter, then news, then blogs.
candidate_files <- c("en_US.twitter.txt", "en_US.news.txt", "en_US.blogs.txt")
# as.numeric() strips any names/classes carried by the inputs so that the
# vector holds plain numbers.
candidate_lengths <- c(
  as.numeric(twitterl),
  as.numeric(newsl),
  as.numeric(blogsl)
)
longest_idx <- which.max(candidate_lengths)
largestl <- candidate_lengths[longest_idx]
filenam <- candidate_files[longest_idx]

sprintf("The longest line is %s chars in file %s.", largestl, filenam)

## [1] "The longest line is 40835 chars in file en_US.blogs.txt."

Question 4

In the en_US twitter data set, if you divide the number of lines in which the word “love” (all lowercase) occurs by the number of lines in which the word “hate” (all lowercase) occurs, about what do you get?

# Ratio of the number of tweets containing "love" to the number of tweets
# containing "hate"; grep() is case-sensitive, so only lowercase matches.
love_hits <- grep("love", twitter_lines)
hate_hits <- grep("hate", twitter_lines)
div <- length(love_hits) / length(hate_hits)

sprintf("We get %s", div)

## [1] "We get 4.10859156202006"

Question 5

The one tweet in the en_US twitter data set that matches the word “biostats” says what?

# Return the text (not the index) of every tweet containing "biostats".
# TRUE is spelled out because the shorthand `T` is an ordinary variable that
# can be reassigned.
biostat <- grep("biostats", twitter_lines, value = TRUE)

print(biostat)

## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"

Question 6

How many tweets have the exact characters “A computer once beat me at chess, but it was no match for me at kickboxing”? (I.e., the line matches those characters exactly.)

# Indices of the tweets containing the sentence below; the number of matching
# tweets is the length of `lns`.
# fixed = TRUE matches the sentence as literal characters rather than as a
# regular expression, which is what an exact-character match calls for.
lns <- grep(
  "A computer once beat me at chess, but it was no match for me at kickboxing",
  twitter_lines,
  fixed = TRUE
)

print(lns)

## [1]  519059  835824 2283423