The objective of this project is to develop a smart keyboard powered by a predictive text model, using text data from blogs, news, and Twitter. The project is part of the Johns Hopkins Data Science Capstone course on Coursera.
Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on mobile devices can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices.
When someone types “I went to the”, the keyboard presents three options for what the next word might be. For example, the three words might be gym, store, or restaurant, using predictive text models like those used by SwiftKey.
The dataset for this project is available at: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
We use the English database, which consists of three text files containing Twitter, blog, and news text.
library(NLP)
library(R.utils)
library(tidytext)
library(tidyverse)
library(quanteda)
library(tm)
library(knitr)
Reading the data.
twitter <- tibble(source = "twitter", text = readLines("./data/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE))
blogs <- tibble(source = "blogs", text = readLines("./data/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE))
news <- tibble(source = "news", text = readLines("./data/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE))
## Warning in readLines("./data/final/en_US/en_US.news.txt", encoding = "UTF-8", :
## incomplete final line found on './data/final/en_US/en_US.news.txt'
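As a quick sanity check on the raw inputs, the three files can be summarized without building the full corpus. A minimal sketch using base R's file.size() and R.utils::countLines(); the paths are assumed to match the ones used above:
files <- c("./data/final/en_US/en_US.twitter.txt",
           "./data/final/en_US/en_US.blogs.txt",
           "./data/final/en_US/en_US.news.txt")
# size on disk (MB) and raw line count for each source file
data.frame(file = basename(files),
           size_mb = round(file.size(files) / 1024^2, 1),
           lines = sapply(files, countLines))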
masterdata <- rbind(twitter, blogs, news)
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# createDataPartition stratifies by source; keep the ~20% of lines not selected as a manageable sample for exploration
inTrain <- createDataPartition(y = masterdata$source, p = 0.8, list = FALSE)
data <- masterdata[-inTrain, ]
Total lines from each source in the exploratory sample:
data%>%group_by(source)%>%
summarise(total_lines=length(text))%>%
kable()
| source | total_lines |
|---|---|
| blogs | 179857 |
| news | 15451 |
| twitter | 472029 |
tidy<-data%>%unnest_tokens(word,text)
tidy%>%group_by(source)%>%
count(source, sort = TRUE)%>% kable()
| source | n |
|---|---|
| blogs | 7510821 |
| twitter | 6017211 |
| news | 533733 |
# The stop_words lexicon is provided by tidytext (loaded above)
data("stop_words")
tidy_sw<-tidy%>%
anti_join(stop_words)
## Joining, by = "word"
tidy_sw%>%
count(word, sort = TRUE) %>%
top_n(20)
## Selecting by n
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 time 34183
## 2 love 30555
## 3 day 29238
## 4 people 23108
## 5 rt 17898
## 6 3 15843
## 7 2 15415
## 8 life 15382
## 9 lol 14799
## 10 night 12587
## 11 happy 12573
## 12 1 11982
## 13 week 11427
## 14 home 11090
## 15 follow 10694
## 16 world 10362
## 17 hope 10219
## 18 feel 10001
## 19 tonight 9608
## 20 4 8784
uni20<-tidy%>%
count(word, sort = TRUE) %>%
top_n(20)
## Selecting by n
unibar<- ggplot(data = uni20, aes(x = reorder(word, -n), y = n)) +
geom_bar(stat = "identity", fill = "orange") +
xlab("Words") +
ylab("Frequency") +
ggtitle(paste("Top 20 unigrams")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
unibar
data_bigram <- data %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
bi10<-data_bigram %>%
count(bigram, sort = TRUE)%>%
top_n(20)
## Selecting by n
bi10
## # A tibble: 20 × 2
## bigram n
## <chr> <int>
## 1 of the 51297
## 2 in the 49009
## 3 for the 27389
## 4 to the 27177
## 5 on the 25752
## 6 to be 23700
## 7 at the 17849
## 8 i have 16054
## 9 and the 15506
## 10 i was 15172
## 11 is a 15106
## 12 i am 14654
## 13 in a 14489
## 14 and i 14291
## 15 it was 14155
## 16 it is 13537
## 17 with the 13238
## 18 for a 12975
## 19 if you 12933
## 20 have a 12310
bibar<- ggplot(data = bi10, aes(x = reorder(bigram, -n), y = n)) +
geom_bar(stat = "identity", fill = "yellow") +
xlab("Words") +
ylab("Frequency") +
ggtitle(paste("Top 20 bigrams")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
bibar
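Since the end goal is next-word prediction, these bigram counts can already serve as a simple lookup table: split each bigram into its first and second word and keep the most frequent continuations for each first word. A minimal sketch (tidyr is part of the tidyverse loaded above; the names bi_pred, word1 and word2 are illustrative):
bi_pred <- data_bigram %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  group_by(word1) %>%
  slice_max(n, n = 3, with_ties = FALSE) %>%
  ungroup()
# e.g. filter(bi_pred, word1 == "went") shows the three words most often seen after "went"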
data_trigram <- data %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3)
# Drop NA trigrams produced by lines too short to form a trigram
tri20 <- data_trigram %>% filter(!is.na(trigram)) %>%
count(trigram, sort = TRUE)%>%
top_n(20)
## Selecting by n
tri20
## # A tibble: 20 × 2
## trigram n
## <chr> <int>
## 1 thanks for the 4794
## 2 one of the 4198
## 3 a lot of 3859
## 4 i want to 2767
## 5 to be a 2596
## 6 going to be 2522
## 7 i have a 2284
## 8 looking forward to 2150
## 9 it was a 2125
## 10 i have to 2058
## 11 thank you for 2013
## 12 i need to 1893
## 13 the end of 1877
## 14 be able to 1873
## 15 i love you 1861
## 16 out of the 1853
## 17 some of the 1717
## 18 as well as 1644
## 19 the rest of 1641
## 20 can't wait to 1627
tribar <- ggplot(data = tri20, aes(x = reorder(trigram, -n), y = n)) +
geom_bar(stat = "identity", fill = "green") +
xlab("Words") +
ylab("Frequency") +
ggtitle(paste("Top 20 trigrams")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
tribar
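The bigram and trigram tables point directly at the prediction strategy: look up the last two typed words in a trigram table and, if nothing matches, back off to the bigram table. A minimal sketch under those assumptions (predict_next and tri_pred are illustrative names; bi_pred is the bigram lookup sketched earlier):
tri_pred <- data_trigram %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ")

predict_next <- function(w1, w2, k = 3) {
  # try the trigram table first: words observed after the pair (w1, w2)
  hits <- tri_pred %>% filter(word1 == w1, word2 == w2) %>% slice_max(n, n = k, with_ties = FALSE)
  if (nrow(hits) > 0) return(hits$word3)
  # otherwise back off to the bigram table: words observed after w2 alone
  bi_pred %>% filter(word1 == w2) %>% slice_max(n, n = k, with_ties = FALSE) %>% pull(word2)
}
# predict_next("to", "the") returns up to three candidate continuations of "I went to the"
In the full model these raw counts would be converted to probabilities and smoothed, but the lookup structure stays the same.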
Questions to consider for the next steps:

- How many unique words do you need in a frequency-sorted dictionary to cover 50% of all word instances in the language? 90%? (A sketch of this calculation follows the list.)
- How do you evaluate how many of the words come from foreign languages?
- Can you think of a way to increase the coverage, either by identifying words that may not be in the corpora or by using a smaller number of words in the dictionary to cover the same number of phrases?
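The coverage question can be answered directly from the unigram frequencies in the sample: sort words by count, compute the cumulative share of all word instances, and find where it crosses 50% and 90% (the foreign-language question could be approached similarly by checking which high-frequency words are absent from an English dictionary). A minimal sketch; the exact numbers depend on the random sample drawn above:
word_freq <- tidy %>%
  count(word, sort = TRUE) %>%
  mutate(coverage = cumsum(n) / sum(n))
# smallest dictionary size reaching 50% coverage of all word instances
which(word_freq$coverage >= 0.5)[1]
# smallest dictionary size reaching 90% coverage
which(word_freq$coverage >= 0.9)[1]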