#Task 1: Getting and Cleaning Data

#Introduction

Large text databases in a target language are commonly used for building language models across various applications. In this exercise, we will work with an English database but may also explore three other databases in German, Russian, and Finnish.

The goal of this project is to demonstrate that we are comfortable working with the data and on track to build our prediction algorithm. The objective of this task is to familiarize ourselves with the databases and perform the necessary data cleaning. After completing this exercise, you should understand the nature of raw text data and the effort required to clean it effectively. When developing a language model for a new language, the first step is to understand the language's structure and unique characteristics: its writing system, input methods, phonetic patterns, and grammatical rules. You can learn these by studying the language directly or by analyzing available datasets and literature.

The motivation for this project is to:

1. Demonstrate that you have downloaded the data and successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

#Loading required packages
library(tm)
## Loading required package: NLP
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

#Downloading and extracting the files from the web

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(url, destfile = "capstone_dataset.zip")
unzip("capstone_dataset.zip")
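As written, the download and extraction run on every knit. A small guard such as the sketch below, using the same file names as above (and assuming the archive unpacks into the final/ directory referenced in the paths later on), skips both steps when the data is already on disk:

if (!file.exists("capstone_dataset.zip")) {
  download.file(url, destfile = "capstone_dataset.zip")
}
if (!dir.exists("final")) {  # assumed extraction directory, per the paths below
  unzip("capstone_dataset.zip")
}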

#Reading the files

url_directory <- "/Users/mac/Desktop/AnasR/final/en_US/"
url_twitter <- paste(url_directory, "en_US.twitter.txt", sep = "")
url_blogs <- paste(url_directory, "en_US.blogs.txt", sep = "")
url_news <- paste(url_directory, "en_US.news.txt", sep = "")


twitter_file <- readLines(url_twitter, encoding="UTF-8")
## Warning in readLines(url_twitter, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(url_twitter, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(url_twitter, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(url_twitter, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
blogs_file <- readLines(url_blogs, encoding="UTF-8")
news_file <- readLines(url_news, encoding="UTF-8")
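The warnings above come from stray NUL bytes embedded in a few lines of the Twitter file; they are harmless here, since readLines() simply truncates the affected lines. If you prefer a quiet read, readLines() takes a skipNul argument that drops those bytes, as in this optional sketch:

twitter_file <- readLines(url_twitter, encoding = "UTF-8", skipNul = TRUE)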

#Getting the bad-words file from the web

badwords_url <-"http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
download.file(badwords_url, destfile = "bad-words.txt")
badwords <- readLines("bad-words.txt", encoding="UTF-8")

#Setting the seed and subsetting the data

set.seed(2025)
subset_all_data <- c(sample(twitter_file, 6000), sample(blogs_file, 6000), sample(news_file, 6000))
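Taking a flat 6,000 lines from each source weights the three files equally even though they differ considerably in size. An alternative sketch samples a fixed fraction of each file instead, so the subset mirrors the relative sizes of the sources (the 1% fraction is an arbitrary choice, not part of the original analysis):

set.seed(2025)
frac <- 0.01  # arbitrary sampling fraction; adjust to taste
subset_all_data <- c(sample(twitter_file, round(length(twitter_file) * frac)),
                     sample(blogs_file, round(length(blogs_file) * frac)),
                     sample(news_file, round(length(news_file) * frac)))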

#Cleaning and organizing the data

clean_data <- subset_all_data

clean_data <- iconv(clean_data, "UTF-8", "ASCII", sub = "")

clean_data <- removeNumbers(clean_data)

clean_data <- stripWhitespace(clean_data) 

clean_data <- tolower(clean_data) 

clean_data <- removePunctuation(clean_data)

clean_data <- removeWords(clean_data, c(badwords))
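Since the same pipeline will likely be rerun on other samples, the steps above can be collected into one reusable function. A minimal sketch, with the transformations kept in the original order (the clean_text name is ours):

clean_text <- function(x, profanity) {
  x <- iconv(x, "UTF-8", "ASCII", sub = "")  # drop non-ASCII characters
  x <- removeNumbers(x)
  x <- stripWhitespace(x)
  x <- tolower(x)
  x <- removePunctuation(x)
  removeWords(x, profanity)  # profanity filtering last, on lowercased text
}

clean_data <- clean_text(subset_all_data, badwords)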

#Getting information about the files

data <- data.frame(file = c("Twitter", "Blogs", "News"),
           size_MB = c(format(object.size(twitter_file), "MB"),
                       format(object.size(blogs_file), "MB"),
                       format(object.size(news_file), "MB")),
           lines = c(length(twitter_file),  # the files are already in memory,
                     length(blogs_file),    # so there is no need to re-read them
                     length(news_file)),    # from disk with readLines()
           longest_line = c(max(nchar(twitter_file)),
                            max(nchar(blogs_file)),
                            max(nchar(news_file)))
)
data
##      file  size_MB   lines longest_line
## 1 Twitter   319 Mb 2360148          140
## 2   Blogs 255.4 Mb  899288        40833
## 3    News 257.3 Mb 1010242        11384

The table contains three rows and four columns. Twitter has the highest number of lines, while the blogs file contains the longest line.
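A word count per file would round out these summary statistics. A rough sketch (not run for this report; splitting on whitespace only approximates real tokenization):

word_count <- function(x) sum(lengths(strsplit(x, "\\s+")))
data$words <- c(word_count(twitter_file),
                word_count(blogs_file),
                word_count(news_file))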

#Summary of the data using ggplot2

ggplot(data, aes(x = file, y = longest_line, fill = file)) +
  geom_col() +  # geom_col() plots the values as given (equivalent to geom_bar(stat = "identity"))
  labs(title = "Longest Line per File", x = "File", y = "Longest Line (characters)") +
  theme_minimal()
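To avoid repeating the download and cleaning in the next milestone, the cleaned sample can be saved to disk, e.g. with saveRDS() (the file name here is arbitrary):

saveRDS(clean_data, "clean_sample.rds")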