#Task1: Getting and Cleaning Data #Introduction Large text databases in a target language are commonly used for generating language models across various applications. In this exercise, we will work with an English database but may also explore three other databases in German, Russian, and Finnish.
The goal of this project is just to display that we’ve gotten used to working with the data and that we are on track to create our prediction algorithm. The objective of this task is to familiarize ourselves with the databases and perform the necessary data cleaning. After completing this exercise, you should understand the nature of raw data and the extent of effort required for effective cleaning. When developing a language model for a new language, the first step is to comprehend the language’s structure and unique characteristics. This involves understanding its writing system, input methods, phonetic patterns, and grammatical rules. You can learn this by studying the language directly or analyzing available datasets and literature.
The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you have amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
#loading required packages
library(tm)
## Loading required package: NLP
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
#Downloading and loading files from websites
# Source archive holding the capstone corpora.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Paths are relative to the working directory, because that is where
# unzip() extracts the archive. The archive is expected to unpack into
# final/en_US/ (this matches the tail of the previously hard-coded
# absolute path). The trailing slash is kept so the value can still be
# concatenated directly with a file name.
url_directory <- paste0(file.path("final", "en_US"), "/")
url_twitter <- paste0(url_directory, "en_US.twitter.txt")
url_blogs <- paste0(url_directory, "en_US.blogs.txt")
url_news <- paste0(url_directory, "en_US.news.txt")

# Only hit the network / disk when the corpora are not already present, so
# re-running the report does not re-download the large archive.
if (!all(file.exists(c(url_twitter, url_blogs, url_news)))) {
  if (!file.exists("capstone_dataset.zip")) {
    # mode = "wb" keeps the binary zip intact on Windows.
    download.file(url, destfile = "capstone_dataset.zip", mode = "wb")
  }
  unzip("capstone_dataset.zip")
}

# The Twitter file contains embedded NUL bytes: without skipNul, readLines()
# warned about lines 167155, 268547, 1274086 and 1759032 and cut those lines
# off at the NUL. skipNul = TRUE drops the NULs and keeps the full lines.
twitter_file <- readLines(url_twitter, encoding = "UTF-8", skipNul = TRUE)
blogs_file <- readLines(url_blogs, encoding = "UTF-8", skipNul = TRUE)
news_file <- readLines(url_news, encoding = "UTF-8", skipNul = TRUE)
#Getting badwords file from web
badwords_url <-"http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
# Reuse a previously downloaded copy so the report can be rebuilt offline and
# does not depend on the remote host being reachable on every run.
if (!file.exists("bad-words.txt")) {
  download.file(badwords_url, destfile = "bad-words.txt")
}
badwords <- readLines("bad-words.txt", encoding = "UTF-8")
# Drop blank lines so no empty "word" ends up in the removal list.
badwords <- badwords[nzchar(badwords)]
#Setting and subsetting the data
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(2025)
# Take 6000 random lines from each corpus. The draw order (Twitter, blogs,
# news) determines the random stream and therefore the selected lines.
subset_all_data <- c(
  sample(twitter_file, size = 6000),
  sample(blogs_file, size = 6000),
  sample(news_file, size = 6000)
)
#Cleaning and organizing the data
clean_data <- subset_all_data
# Drop every character that has no ASCII representation.
clean_data <- iconv(clean_data, "UTF-8", "ASCII", sub = "")
clean_data <- removeNumbers(clean_data)
# Lower-case before filtering so bad words match regardless of case.
clean_data <- tolower(clean_data)
clean_data <- removePunctuation(clean_data)
clean_data <- removeWords(clean_data, badwords)
# Normalise whitespace last: removing numbers, punctuation and words leaves
# doubled and leading/trailing spaces behind, so collapsing earlier would
# not clean them up.
clean_data <- trimws(stripWhitespace(clean_data))
#Getting information about the files
# One row per corpus. All statistics are computed from the character vectors
# already in memory, so the large files are not read a second time (which
# also avoids repeating the embedded-NUL warnings for the Twitter file).
#   size_MB      - in-memory size of the loaded vector (object.size), not the
#                  size of the file on disk
#   lines        - number of lines read
#   longest_line - character count of the longest line
data <- data.frame(
  file = c("Twitter", "Blogs", "News"),
  size_MB = c(
    format(object.size(twitter_file), units = "MB"),
    format(object.size(blogs_file), units = "MB"),
    format(object.size(news_file), units = "MB")
  ),
  lines = c(
    length(twitter_file),
    length(blogs_file),
    length(news_file)
  ),
  longest_line = c(
    max(nchar(twitter_file)),
    max(nchar(blogs_file)),
    max(nchar(news_file))
  )
)
data
## file size_MB lines longest_line
## 1 Twitter 319 Mb 2360148 140
## 2 Blogs 255.4 Mb 899288 40833
## 3 News 257.3 Mb 1010242 11384
The summary table contains three rows and four columns. Twitter has the highest number of lines, while the blogs file contains the longest line.
#Summary of the data using ggplot
# One bar per corpus; bar height is the longest line's character count and
# the fill colour distinguishes the corpora.
ggplot(data = data, mapping = aes(x = file, y = longest_line, fill = file)) +
  geom_col() + # plots the supplied values as-is (same as stat = "identity")
  ggtitle("Bar Plot of File Types") +
  xlab("File Type") +
  ylab("Longest Line") +
  theme_minimal()