Load the data

# Fetch the Coursera SwiftKey corpus archive once; the download is skipped when
# the zip is already present in the working directory.
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
  # mode = "wb": the archive is binary and must not be written in text mode.
  download.file(
    "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
    zip_file,
    mode = "wb"
  )
}

# The three English corpora are read from the working directory below. Extract
# only the ones that are not there yet, dropping the archive's internal
# directories (junkpaths = TRUE) so they land where readLines() looks for them.
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
missing_files <- corpus_files[!file.exists(corpus_files)]
if (length(missing_files) > 0) {
  zip_entries <- unzip(zip_file, list = TRUE)$Name
  wanted_entries <- zip_entries[basename(zip_entries) %in% missing_files]
  # Fail with a clear message rather than extracting nothing (or everything).
  if (length(wanted_entries) < length(missing_files)) {
    stop(
      "Archive ", zip_file, " does not contain: ",
      paste(setdiff(missing_files, basename(wanted_entries)), collapse = ", "),
      call. = FALSE
    )
  }
  unzip(zip_file, files = wanted_entries, junkpaths = TRUE)
}

# One character string per line of each file. skipNul skips embedded NUL bytes;
# encoding marks the resulting strings as UTF-8.
blogs_data <- readLines("en_US.blogs.txt", skipNul = TRUE, encoding = "UTF-8")
news_data <- readLines("en_US.news.txt", skipNul = TRUE, encoding = "UTF-8")
twitter_data <- readLines("en_US.twitter.txt", skipNul = TRUE, encoding = "UTF-8")

Basic Info - Number of lines

# Number of lines in each corpus. All three counts go through print() so they
# are reported the same way, including when the script is run with source(),
# which does not auto-print bare top-level expressions by default.
print("Number of Lines News File:")
print(length(news_data))
## [1] "Number of Lines News File:"
## [1] 1010242
print("Number of Lines blogs File:")
print(length(blogs_data))
## [1] "Number of Lines blogs File:"
## [1] 899288
print("Number of Lines twitter File:")
print(length(twitter_data))
## [1] "Number of Lines twitter File:"
## [1] 2360148

Basic Info - Longest line

# Length, in characters, of the longest line in each corpus.
print("Max # Chars in line blogs File:")
max(nchar(blogs_data))
## [1] "Max # Chars in line blogs File:"
## [1] 40833

print("Max # Chars in line news File:")
max(nchar(news_data))
## [1] "Max # Chars in line news File:"
## [1] 11384

print("Max # Chars in line twitter File:")
max(nchar(twitter_data))
## [1] "Max # Chars in line twitter File:"
## [1] 140

Basic Info - Shortest line

# Length, in characters, of the shortest line in each corpus.
# The labels say "Min" to match the min() being reported.
print("Min # Chars in line blogs File:")
min(nchar(blogs_data))
## [1] "Min # Chars in line blogs File:"
## [1] 1
print("Min # Chars in line news File:")
min(nchar(news_data))
## [1] "Min # Chars in line news File:"
## [1] 1
print("Min # Chars in line twitter File:")
min(nchar(twitter_data))
## [1] "Min # Chars in line twitter File:"
## [1] 2

Basic Info - Average line

# Mean line length, in characters, for each corpus.
# The labels say "Mean" to match the mean() being reported.
print("Mean # Chars in line blogs File:")
mean(nchar(blogs_data))
## [1] "Mean # Chars in line blogs File:"
## [1] 229.987
print("Mean # Chars in line news File:")
mean(nchar(news_data))
## [1] "Mean # Chars in line news File:"
## [1] 201.1628
print("Mean # Chars in line twitter File:")
mean(nchar(twitter_data))
## [1] "Mean # Chars in line twitter File:"
## [1] 68.68054

Examples for each dataset

# First three lines of each corpus, shown in file order (1, 2, 3) for all
# three datasets.
print("Examples of News")
## [1] "Examples of News"
news_data[1]
news_data[2]
news_data[3]
## [1] "He wasn't home alone, apparently."
## [1] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [1] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
print("Examples of blogs")
## [1] "Examples of blogs"
blogs_data[1]
blogs_data[2]
blogs_data[3]
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan <U+201C>gods<U+201D>."
## [1] "We love you Mr. Brown."
## [1] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
print("Examples of Twitter")
## [1] "Examples of Twitter"
twitter_data[1]
twitter_data[2]
twitter_data[3]
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [1] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [1] "they've decided its more fun if I don't."

Distribution of the number of characters

# Side-by-side boxplots of line length (in characters) for the three corpora.
# par() returns the settings it replaces; they are restored afterwards instead
# of assuming the device started with a 1 x 1 layout.
old_par <- par(mfrow = c(1, 3))

# The y-axis is capped at 400 characters so the boxes stay readable; lines
# longer than that fall outside the plotted range.
boxplot(nchar(blogs_data), ylim = c(0, 400), main = "Blogs",
        ylab = "Number of Characters")
boxplot(nchar(news_data), ylim = c(0, 400), main = "News",
        ylab = "Number of Characters")
boxplot(nchar(twitter_data), ylim = c(0, 400), main = "Twitter",
        ylab = "Number of Characters")

par(old_par)