tw=readLines(con = "data/en_US/en_US.twitter.txt")
## Warning in readLines(con = "data/en_US/en_US.twitter.txt"): line 167155 appears
## to contain an embedded nul
## Warning in readLines(con = "data/en_US/en_US.twitter.txt"): line 268547 appears
## to contain an embedded nul
## Warning in readLines(con = "data/en_US/en_US.twitter.txt"): line 1274086 appears
## to contain an embedded nul
## Warning in readLines(con = "data/en_US/en_US.twitter.txt"): line 1759032 appears
## to contain an embedded nul
blog=readLines(con = "data/en_US/en_US.blogs.txt")
news=readLines(con = "data/en_US/en_US.news.txt")
## Warning in readLines(con = "data/en_US/en_US.news.txt"): incomplete final line
## found on 'data/en_US/en_US.news.txt'
Parentheses can be used to capture a set of matches (a subexpression). For example, if I want to repeat the same subexpression without rewriting it, I can use the backreference \1 to match whatever the first group captured. The regex +([A-Za-z]+) +\1 + therefore returns repetitions, since it searches for the first subexpression twice.
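As a quick illustration (the sample vector below is made up, not taken from the course data):
x = c("I was very very tired", "no repeats in this one")
# keep only the entries where the same word appears twice in a row
grep(" ([A-Za-z]+) \\1 ", x, value = TRUE)
## [1] "I was very very tired"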
The * is "greedy", so it always matches the longest possible string that satisfies the pattern you are searching for. To make it lazy (non-greedy) you can add ? after the *, as in s(.*?)s$, so the match covers the shortest stretch possible rather than the whole sentence.
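For example (on a made-up string; perl = TRUE is used so the lazy quantifier is definitely supported):
x = "sitting at starbucks"
regmatches(x, regexpr("s(.*)s", x, perl = TRUE))   # greedy match
## [1] "sitting at starbucks"
regmatches(x, regexpr("s(.*?)s", x, perl = TRUE))  # lazy match
## [1] "sitting at s"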
The word "love" appears in roughly four times as many tweets as the word "hate".
length(which(grepl(pattern = "love",x = tw)))/length(which(grepl(pattern = "hate",x = tw)))
grep(pattern = "biostats",x = tw,value=TRUE)
grep(pattern = "A computer once beat me at chess, but it was no match for me at kickboxing",x = tw,value=TRUE)
Even though the program suggests using the tm package, I've found that the newer tidytext package is faster and easier to understand, since it follows the same principles as the tidyverse. If you know how to work with data in tidy format, you're good to go with this package.
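For anyone unfamiliar with it, here is a minimal sketch of the tidytext workflow on a toy two-line vector (the sentences are made up, just to show the one-token-per-row output):
library(dplyr)
library(tidytext)
# one row per input line goes in, one row per token comes out
tibble(text = c("The quick brown fox", "jumps over the lazy dog")) %>%
  unnest_tokens(word, text)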
When running both packages' tokenizers on the news data set, the smallest of the three, the unnest_tokens function from tidytext needs about 84% less time. This matters, because the first time I tried to tokenize the entire datasets with a custom function built around MC_tokenizer it took 34 minutes, even running in parallel. The first graph shows that the time improvement is consistent across iterations.
I also tried the furrr package with nested tibbles (the chunk is now commented out after the for loop), but it didn't work out, since it took 36.75 seconds. So the comparison is made only between the first two functions.
# Benchmark: tokenize the news vector 30 times with each approach and record
# the user CPU time (column 1 = tm::MC_tokenizer, column 2 = tidytext::unnest_tokens)
largo=30
time=matrix(nrow=largo,ncol=2)
for (i in 1:largo) {
  time[i,1]=system.time({MC_tokenizer(news)})[1]
  time[i,2]=system.time({unnest_tokens(tbl = tibble(news=news),input = "news",output ="word" )})[1]
}
time.graph=data.frame(mc.tokenizer=time[,1],tidytext=time[,2],try=1:largo) %>%gather(method,time,-try) %>% ggplot(aes(try,time,color=method))+geom_line(lwd=1)+
labs(title="Time needed to tokenize News Vector",
x="# of Iteration",
y="Time in seconds")
df.news=tibble(text=news) %>% mutate(index=row_number()) %>% unnest_tokens(word,text) %>% mutate(source=3)
df.tw=tibble(text=tw) %>% mutate(index=row_number()) %>% unnest_tokens(word,text) %>% mutate(source=1)
df.blog=tibble(text=blog) %>% mutate(index=row_number()) %>% unnest_tokens(word,text) %>% mutate(source=2)
# plan(multisession, workers = 10)
#
# system.time({
# data.frame(news=news) %>% mutate(index=row_number()) %>% nest(data=c(news)) %>%
# mutate(tokens = future_map(
# data,
# ~ unnest_tokens(., word, news)
# ))
# })
saveRDS(time.graph,"time_graph.RDS")
saveRDS(df.news,"df_news.RDS")
saveRDS(df.tw,"df_tw.RDS")
saveRDS(df.blog,"df_blog.RDS")
time.graph=readRDS("time_graph.RDS")
time.graph
We are dealing with "big" datasets here, so you either need to create samples and use inference (to save computing power and time), or have a powerful computer. Here are some descriptive values of the datasets.
df.news=readRDS("df_news.RDS")
df.tw=readRDS("df_tw.RDS")
df.blog=readRDS("df_blog.RDS")
tw.size=object.size(tw)
blog.size=object.size(blog)
news.size=object.size(news)
data.frame(Source=c("Twitter","News","Blogs"),
Length=c(length(tw),length(news),length(blog)),
Size=c(tw.size,
news.size,
blog.size),
Tokens=c(nrow(df.tw),nrow(df.news),nrow(df.blog))) %>%
gt() %>%
fmt_bytes(columns = c(Size)) %>%
fmt_number(c(Length,Tokens),decimals = 0) %>%
tab_header(title="Summary of Data Objects",subtitle="By Source")%>%
tab_options(
column_labels.background.color = "Lightblue",
heading.background.color = "DarkBlue")
Summary of Data Objects (By Source)

| Source | Length | Size | Tokens |
|---|---|---|---|
| Twitter | 2,360,148 | 334.5 MB | 30,218,125 |
| News | 77,259 | 20.7 MB | 2,693,898 |
| Blogs | 899,288 | 267.8 MB | 38,154,238 |
Here we show that blog entries have, on average, the most words per entry, while news entries appear to use the most complex words, since their words are the longest on average.
df.en=rbind(df.news,df.blog,df.tw) %>% mutate(source=as.factor(source))
levels(df.en$source)=c("Twitter","Blog","News")
rm(list = c("df.news","df.blog","df.tw","tw","news","blog"))
gc()
summary.df=df.en %>% group_by(source,index) %>% summarise(avg.letters=mean(nchar(word)),
tokens=n(),
tot.letters=sum(nchar(word)))
tot.summary=summary.df %>% group_by(source) %>% summarise(avg.tokens=mean(tokens),
avg.letter=mean(tot.letters/tokens),
entries=max(index))
saveRDS(tot.summary,"tot_summary_words.RDS")
saveRDS(df.en,"words.RDS")
tot.summary=readRDS("tot_summary_words.RDS")
tot.summary %>% ggplot(aes(avg.tokens,avg.letter,color=source))+geom_point(aes(size=entries))+
scale_size_area(max_size=20)+geom_label(aes(label=source),nudge_y = 0.05,nudge_x = 0.5) +
labs(title="EDA on data",subtitle="By Data source",x="Average words (tokens) per entry (rows)",y="Average Length of each word per entry")
The most common words repeat across the three data sources, although some appear more on some platforms than on others. News, for instance, uses words such as "said", "he", "his", and "from", which are typical of writing in the third person, something news does quite often.
plot.common=df.en %>% group_by(source) %>%
count(word, sort = TRUE) %>%
slice_max(n,n=20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)+facet_wrap(source~.,scales = "free_x")+
labs(title="Top 20 Most Common Words",subtitle="By Data source")
saveRDS(plot.common,"plot_common.RDS")
plot.common=readRDS("plot_common.RDS")
plot.common
Here is evidence that the average number of tokens per entry and the average number of letters per token follow different distributions for each source. If we wanted to classify or distinguish between writing styles per platform, we could do so.
# Bootstrap: draw 1,000 resamples of 10,000 entries each (with replacement) and
# compute, per source, the average tokens per entry and letters per token for each resample
i=1
df=list()
nested.df=df.en %>% group_by(source,index) %>% nest()
while (i<=1000) {
index= sample(1:nrow(nested.df),size = 10000,replace=TRUE)
df[[i]]=nested.df[index,] %>% unnest(cols = c(data)) %>% group_by(source,index) %>% summarise(avg.letters=mean(nchar(word)),
tokens=n(),
tot.letters=sum(nchar(word))) %>%
ungroup() %>% group_by(source) %>% summarise(avg.tokens=mean(tokens),
avg.letter=mean(tot.letters/tokens))
i=i+1
}
list.summary=bind_rows(df)
saveRDS(list.summary,"boostrapped summaries.RDS")
rm(list = c("nested.df"))
list.summary=readRDS("boostrapped summaries.RDS")
list.summary %>% ggplot(aes(x=avg.tokens,fill=source))+geom_density(alpha=0.5) + labs(title = "Number of tokens per entry",subtitle="Bootstrapped 1000 times with sample size = 10,000")
list.summary %>% ggplot(aes(x=avg.letter,fill=source))+geom_density(alpha=0.5) + labs(title = "Average Letters per Word",subtitle="Bootstrapped 1000 times with sample size = 10,000")
Now we are going to take a look at bigrams. The size of the data frame will increase, since we are now looking at combinations of two words rather than single tokens.
tw=readLines(con = "data/en_US/en_US.twitter.txt")
blog=readLines(con = "data/en_US/en_US.blogs.txt")
news=readLines(con = "data/en_US/en_US.news.txt")
df.news=tibble(text=news) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 2) %>% mutate(source=3)
df.tw=tibble(text=tw) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 2) %>% mutate(source=1)
df.blog=tibble(text=blog) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 2) %>% mutate(source=2)
df.en=rbind(df.news,df.blog,df.tw) %>% mutate(source=as.factor(source))
levels(df.en$source)=c("Twitter","Blog","News")
rm(list = c("df.news","df.blog","df.tw","tw","news","blog"))
gc()
saveRDS(df.en,"bigrams.RDS")
"In the" and "of the" seem to be the most common bigrams across the sources, especially in blogs and news. Nevertheless, the number of common "tokens" shared across sources is now smaller than it was for single words in the previous graphs.
common bi-grams
On Twitter, the most common bigram that starts with "I" followed by a space is "I love", but that doesn't hold for blogs and news, where forms of the verb "to be" ("I was" and "I am") are the most used (a quick way to check this is sketched below).
common bi-grams starting with “I”
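A rough check of that claim (a sketch, assuming the bi-gram data frame df.en created above is still in memory; note that unnest_tokens lower-cases tokens by default, hence the lowercase "i"):
# three most frequent bi-grams beginning with "i " in each source
df.en %>% filter(grepl("^i ", word)) %>%
  count(source, word, sort = TRUE) %>%
  group_by(source) %>% slice_max(n, n = 3)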
We need to optimize our model in terms of the RAM it needs to run, so we are going to rely on Pareto's principle: we keep the n-grams that appear most often, so that they cover 80% of all n-gram occurrences in our dataset. According to Pareto's principle, that would usually leave us with around 20% of the observations.
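As a toy illustration of the computation (made-up counts, not from the data), the two most frequent of five distinct n-grams below already cover 80% of all occurrences, i.e. 40% of the distinct n-grams:
n = c(50, 30, 10, 5, 5)
cumsum(n) / sum(n)
## [1] 0.50 0.80 0.90 0.95 1.00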
As we can see, with the Twitter and Blog entries we can achieve an even higher degree of optimization: only 13.4% of the distinct Twitter bi-grams cover 80% of all bi-gram occurrences, and only 11.5% of the distinct Blog bi-grams do the same.
In the case of News, the complexity pattern continues: we need almost 50% of the distinct bi-grams to reach 80% of occurrences.
Since News is the source that differs the most in terms of complexity and vocabulary, we are going to leave it out of the equation for now.
l=100000
plot.common.bigrams=df.en %>% group_by(source) %>%
count(word, sort = TRUE)
plot.common.bigrams=plot.common.bigrams %>% mutate(prop=n/sum(n),
prop.cum=cumsum(prop),
pareto=if_else(prop.cum<=0.8,TRUE,FALSE))
pareto.table=plot.common.bigrams %>% group_by(source,pareto) %>%
summarise(entries=n()) %>% mutate(prop=entries/sum(entries))
saveRDS(plot.common.bigrams,file = "bigrams_freq.RDS")
saveRDS(pareto.table,file = "pareto_bigrams.RDS")
pareto.table=readRDS("pareto_bigrams.RDS")
pareto.table %>% filter(pareto==TRUE) %>% select(-pareto) %>% ungroup %>% gt() %>% fmt_percent(columns = prop) %>%
fmt_number(entries,decimals = 0) %>% tab_header(title="Pareto's Principle",subtitle = "Number of bi-grams to cover 80% of all combinations") %>%
cols_label(
prop = "Proportion of total",
entries = "# of Bi-Grams",
source="Source"
) %>%
tab_style(
style=list(cell_fill("Darkblue"),cell_text(color="white",align="center")),
locations=cells_column_labels()
) %>%
tab_style(
style=list(cell_fill("Lightgreen"),cell_text(color="DarkBlue")),
locations=cells_body(columns = c('prop'),
rows=prop<=0.2)
) %>%
tab_style(
style=list(cell_text(align="center")),
locations=cells_body()
) %>%
tab_footnote(footnote = "Green Cells = sources with proportion <= to 20%",locations = cells_column_labels(columns = c('prop')))
Pareto's Principle (Number of bi-grams to cover 80% of all combinations)

| Source | # of Bi-Grams | Proportion of total¹ |
|---|---|---|
| Twitter | 719,844 | 13.43% |
| Blog | 748,394 | 11.46% |
| News | 479,265 | 47.80% |

¹ Green Cells = sources with proportion <= to 20%
Ungrouping the variables, i.e. identifying common bi-grams without regard to the source, further reduces the number of entries needed to reach 80% of all bi-grams by 546,814 (from 719,844 + 748,394 = 1,468,238 for Twitter and Blog separately down to 921,424 combined). That is a reduction of almost 37%.
plot.common.bigrams=plot.common.bigrams %>% filter(source!="News") %>% ungroup() %>%
group_by(word) %>% summarise(n=sum(n)) %>% arrange(-n)
plot.common.bigrams=plot.common.bigrams %>%
mutate(prop=n/sum(n),
prop.cum=cumsum(prop),
pareto=if_else(prop.cum<=0.8,TRUE,FALSE))
pareto.table2=plot.common.bigrams %>% group_by(pareto) %>%
summarise(entries=n()) %>% mutate(prop=entries/sum(entries))
saveRDS(plot.common.bigrams,file = "bigrams_freq_general.RDS")
saveRDS(pareto.table2,file = "pareto_bigrams_general.RDS")
pareto.table2=readRDS("pareto_bigrams_general.RDS")
pareto.table2 %>% filter(pareto==TRUE) %>% select(-pareto) %>% ungroup %>% gt() %>% fmt_percent(columns = prop) %>%
fmt_number(entries,decimals = 0) %>% tab_header(title="Pareto's Principle",subtitle = "Number of bi-grams to cover 80% of all combinations") %>%
cols_label(
prop = "Proportion of total",
entries = "# of Bi-Grams"
) %>%
tab_style(
style=list(cell_fill("Darkblue"),cell_text(color="white",align="center")),
locations=cells_column_labels()
) %>%
tab_style(
style=list(cell_fill("Lightgreen"),cell_text(color="DarkBlue")),
locations=cells_body(columns = c('prop'),
rows=prop<=0.2)
) %>%
tab_style(
style=list(cell_text(align="center")),
locations=cells_body()
) %>%
tab_footnote(footnote = "Green Cells = sources with proportion <= to 20%",locations = cells_column_labels(columns = c('prop')))
Pareto's Principle (Number of bi-grams to cover 80% of all combinations)

| # of Bi-Grams | Proportion of total¹ |
|---|---|
| 921,424 | 8.88% |

¹ Green Cells = sources with proportion <= to 20%
We can still optimize by finding the point where the marginal increase in cumulative coverage levels off, i.e. where the second difference of the curve is most negative (the increase itself is never negative). We can see the point in the plot; we are going to choose the more conservative option.
plot.common.bigrams=readRDS("bigrams_freq_general.RDS")
plot.common.bigrams=plot.common.bigrams %>% filter(pareto==TRUE) %>% mutate(rank=row_number())
# first difference (primera.d) approximates the slope of the cumulative proportion
# with respect to n; the second difference (segunda.d) shows where that slope drops fastest
curve=plot.common.bigrams %>% distinct_at(vars(n,prop.cum),.keep_all = T) %>% mutate(primera.d=(prop.cum-lag(prop.cum,1))/(n-lag(n,1)),
segunda.d=primera.d-lag(primera.d,1))
# cutoff candidate: the point (within a plausible range of n) with the most negative second difference
minimo=curve %>% filter(n>100,n<40000,!is.infinite(segunda.d)) %>% slice_min(segunda.d,n = 1)
curve %>% ggplot(aes(n,prop.cum))+geom_line(lwd=1.5,color="blue")+
labs(title="Cumulative proportion per most common bi-gram")+geom_point(data = minimo,size=5,color="red")+
coord_flip()
# plot.common.bigrams=plot.common.bigrams %>% filter(rank<=minimo$rank)
# df.s=plot.common.bigrams %>% filter(pareto==TRUE) %>%
# separate(word,into=c("t","t.1"),sep = " ") %>% arrange(-n) %>%
# group_by(t,t.1) %>% group_by(t) %>%
# mutate(prop=n/sum(n)) %>% ungroup() %>% filter(complete.cases(.))
#
# library(data.table)
# df.s=as.data.table(df.s)
# saveRDS(df.s,"bi-gram matrix.RDS")
df.s=readRDS("bi-gram matrix.RDS")
# prediction lookup: given the last word typed, return the three most likely next words
letra="I"
df.s[t==tolower(word(letra,-1)),] %>% slice_max(prop,n=3) %>% pull(t.1)
## [1] "have" "was" "am"
letra="are"
df.s[t==tolower(word(letra,-1)),] %>% slice_max(prop,n=3) %>% pull(t.1)
## [1] "you" "the" "not"
Now we are going to take a look at tri-grams. The size of the data frame will increase again, since we are now looking at combinations of three words.
tw=readLines(con = "data/en_US/en_US.twitter.txt")
blog=readLines(con = "data/en_US/en_US.blogs.txt")
news=readLines(con = "data/en_US/en_US.news.txt")
df.news=tibble(text=news) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 3) %>% mutate(source=3)
df.tw=tibble(text=tw) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 3) %>% mutate(source=1)
df.blog=tibble(text=blog) %>% mutate(index=row_number()) %>% unnest_tokens(word,text, token = "ngrams", n = 3) %>% mutate(source=2)
df.en=rbind(df.news,df.blog,df.tw) %>% mutate(source=as.factor(source))
levels(df.en$source)=c("Twitter","Blog","News")
rm(list = c("df.news","df.blog","df.tw","tw","news","blog"))
gc()
saveRDS(df.en,"trigrams.RDS")
In this case we see a lot of NAs that weren't present before; they probably come from entries with fewer than three words, which cannot form a tri-gram. We are going to remove the NAs.
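A minimal sketch of that clean-up (assuming the tri-gram data frame df.en from the chunk above):
# drop the NA rows produced by unnest_tokens for entries with fewer than 3 words
df.en = df.en %>% filter(!is.na(word))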
common tri-grams starting with “I”