- Research motivation
- Data description
- Research questions
- Analysis
- Conclusion
June 4, 2018
With the rise of the Internet, interaction between people is no longer limited to face-to-face conversation. Many people who find it hard to make friends or to find a partner in real life have turned to online dating, hoping that the online world can make up for what real life lacks, and have started using a variety of dating apps. Because so many people now use such apps, we wanted to use OkCupid user data to understand how users behave on the platform today.
Public profiles of 59,946 OkCupid users
The data consist of a rich mix of categorical, ordinal, numeric, and free-text variables
Fields include: user information (sex, sexual orientation, age, occupation, and ethnicity); lifestyle (diet, drinking, and smoking habits); and ten short-answer essays (self introduction, life plan, what I am good at, personality, lifestyle tastes, view of life, what fascinates me, ideal life, my secret, and what I am looking for in you)
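As a quick orientation before the cleaning steps below, one way to see this mix of variable types is to read the file and glance at a few representative columns. This is a minimal sketch: the file name profiles.csv and the column names used here (sex, orientation, age, job, diet, drinks, smokes, essay0) follow the published OkCupid dataset and should be checked against the actual file.
# Minimal sketch: read the raw file and inspect a few variable types
profiles_raw <- read.csv("profiles.csv", stringsAsFactors = FALSE)
str(profiles_raw[, c("sex", "orientation", "age", "job",
                     "diet", "drinks", "smokes", "essay0")])
table(profiles_raw$sex, useNA = "ifany")  # categorical example
summary(profiles_raw$age)                 # numeric example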
Data preparation
# Load data
profiles <- read.csv(file = "profiles.csv", stringsAsFactors = FALSE)
profiles <- as.data.frame(profiles)
profiles <- na.omit(profiles)
# Replace blanks with NA
library(stringr)
profiles[] <- lapply(profiles, str_trim)
is.na(profiles) <- profiles == ''
# Extract only users with complete data
comp.profiles <- profiles[complete.cases(profiles), ]
str(comp.profiles)
class(comp.profiles)
# Rename the essay columns to descriptive names
names(comp.profiles)[names(comp.profiles) == "essay0"] <- "self.intro"
names(comp.profiles)[names(comp.profiles) == "essay1"] <- "life.plan"
names(comp.profiles)[names(comp.profiles) == "essay2"] <- "talented"
names(comp.profiles)[names(comp.profiles) == "essay3"] <- "personality"
names(comp.profiles)[names(comp.profiles) == "essay4"] <- "lifestyle.taste"
names(comp.profiles)[names(comp.profiles) == "essay5"] <- "view.of.life"
names(comp.profiles)[names(comp.profiles) == "essay6"] <- "fascinated.by"
names(comp.profiles)[names(comp.profiles) == "essay7"] <- "ideal.life"
names(comp.profiles)[names(comp.profiles) == "essay8"] <- "my.secrete"
names(comp.profiles)[names(comp.profiles) == "essay9"] <- "what.turn.me.on"
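The ten renames above can also be written once with a lookup vector; the sketch below is an equivalent alternative (it is a no-op if the columns have already been renamed) and introduces no new column names.
# Equivalent renaming via a named lookup vector (old name -> new name)
essay_names <- c(essay0 = "self.intro",      essay1 = "life.plan",
                 essay2 = "talented",        essay3 = "personality",
                 essay4 = "lifestyle.taste", essay5 = "view.of.life",
                 essay6 = "fascinated.by",   essay7 = "ideal.life",
                 essay8 = "my.secrete",      essay9 = "what.turn.me.on")
hit <- names(essay_names) %in% names(comp.profiles)          # columns still present
idx <- match(names(essay_names)[hit], names(comp.profiles))
names(comp.profiles)[idx] <- unname(essay_names[hit])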
Job distribution by gender
A helper function that reorders the x variable by frequency when plotting
reorder_size <- function(x) {
factor(x, levels = names(sort(table(x))))}
library(ggplot2)
prof.graph <- ggplot(na.omit(profiles),
                     aes(reorder_size(job), fill = sex))
prof.graph + geom_bar() +
  scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
  geom_text(stat = 'count', aes(label = ..count..),
            size = 3.3, angle = 10,
            position = position_stack(vjust = 0.5), check_overlap = T) +
  labs(title = "Job Positions by Gender",
       y = "User count",
       x = "Job position") +
  coord_flip() +
  theme(legend.position = "right",
        axis.text.x = element_text(angle = 45, hjust = 1))
Education distribution by gender
library(forcats)
profiles[, "education"] <- as.factor(profiles[, "education"])
profiles$education <- fct_collapse(profiles$education, other =
c("college/university","dropped out of high school",
"dropped out of law school" ,"dropped out of masters program",
"dropped out of med school","dropped out of ph.d program", "dropped out of two-year college" ,"graduated from law school",
"graduated from med school" ,"graduated from space camp",
"high school","law school" ,"masters program",
"med school","ph.d program" ,"space camp",
"two-year college" ,"working on high school",
"working on law school" ,"working on masters program",
"working on med school" ,"working on ph.d program",
"working on space camp"))
edu.graph <- ggplot(na.omit(profiles),
aes(reorder_size(education), fill = sex))
edu.graph + geom_bar() +
geom_text(stat='count', aes(label=..count..),
size = 3.3, angle = 10, position = position_stack(vjust = 0.5),
check_overlap = T)+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
labs(title = "Education by Gender",
y = "User count (omit NA)",
x = "Education")+ coord_flip() +
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
Income distribution by gender
library(mosaic)   # for fav_stats() and tally()
profiles[, "income"] <- as.integer(profiles[, "income"])
fav_stats(profiles[, "income"])
profiles[, "income"] <- as.factor(profiles[, "income"])
Proportions relative to the overall total
prof <- filter(profiles, income == "20000"
| income == "30000"| income == "40000"
| income == "50000" | income == "60000"
| income == "70000" | income == "80000"
| income == "90000")
income.graph <- ggplot(prof, aes(income, fill = factor(sex)))
income.graph + geom_bar(aes(y = prop.table(..count..) * 100),
position = "dodge") + geom_text(stat='count',
aes(y = prop.table(..count..) * 100 + 0.5,
label = paste0(round(prop.table(..count..) * 100,
digits= 2), '%')),
size = 3.3)+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
labs(title = "Income by Gender",
y = "percent",
x = "Income")+
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
Comparing income-bracket proportions within each gender
ggplot(prof %>%
dplyr::group_by(sex) %>%
#group by sex so that the denominator is the within-gender total
dplyr::mutate(w = 1/n()) %>% ungroup()) +
aes(income, fill = sex, weight = w)+
geom_bar(aes(x = income, fill = sex), position = "dodge", group=1)+
geom_text(stat='count', check_overlap = TRUE,
aes(label = paste0(round(prop.table(..count..) * 100*2,
digits= 2), '%')), size = 3.3)+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
scale_y_continuous(name = "percent", labels = scales::percent,
limits = c(0,0.4))+ labs(title = "Compare income Ratio within Gender",
y = "Density", x = "Income")+
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
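The weighting trick above (including the x2 correction in the label) works but is easy to get wrong. A more transparent alternative, sketched below rather than taken from the original analysis, is to compute the within-gender percentages explicitly before plotting; it assumes the same prof subset and column names.
# Alternative sketch: compute within-gender percentages first, then plot them
library(dplyr)
library(ggplot2)
prof_pct <- prof %>%
  count(sex, income) %>%
  group_by(sex) %>%
  mutate(pct = n / sum(n) * 100) %>%   # percent within each gender
  ungroup()
ggplot(prof_pct, aes(income, pct, fill = sex)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = sprintf("%.2f%%", pct)),
            position = position_dodge(width = 0.9), vjust = -0.3, size = 3.3) +
  scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
  labs(title = "Compare Income Ratio within Gender", y = "Percent", x = "Income")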
Sexual orientation by gender
ori.graph <- ggplot(profiles, aes(orientation, fill = sex))
ori.graph + geom_bar() +
geom_text(stat='count', aes(label=..count..),
size = 3.3, angle = 10, position = position_stack(vjust = 0.5),
check_overlap = T)+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
facet_wrap(~ sex)+
labs(title = "Orientation by Gender",
y = "User count",
x = "Orientation")+
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
Shown as percentages
ori.graph <- ggplot(profiles, aes(factor(orientation),
fill = factor(sex)))
ori.graph + geom_bar(aes(y = prop.table(..count..) * 100)) +
geom_text(stat='count',
aes(y = prop.table(..count..) * 100 + 0.5,
label = paste0(round(prop.table(..count..) * 100,
digits= 2), '%')), size = 3.3,
position = position_stack(vjust = 0.5))+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
facet_wrap(~ sex)+
labs(title = "Orientation by Gender",
y = "% of User",
x = "Orientation")+
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
Height by gender. We use fav_stats() from the mosaic package to inspect numerical summaries.
profiles[, "height"] <- as.numeric(profiles[, "height"])
fav_stats(profiles[, "height"])
profiles.subset <- filter(profiles, height>=55 & height <=80)
height.graph <- ggplot(profiles.subset, aes(height, fill = sex))+
geom_histogram(binwidth = 1, aes(y=..density..), color = "grey30") +
stat_density(geom="line", adjust = 5)+
scale_fill_manual(values = c("rosybrown2", "skyblue2")) +
facet_grid(sex ~ .)+
labs(title = "Height histogram by Gender",
y = "Density",
x = "Height in inches")+
theme(legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
#install.packages("ggpubr")
library(ggpubr)
Compute descriptive statistics by groups
stable <- desc_statby(profiles, measure.var = "height",
grps = "sex")
stable <- stable[, c("sex", "length", "mean", "sd")]
Summary table plot using the default table theme
stable.p <- ggtexttable(stable, rows = NULL,
theme = ttheme("default"))
ggarrange(stable.p, height.graph,
ncol = 1, nrow = 2,
heights = c(0.3, 1))
Text analysis
library(dplyr)
library(stringr)
library(lattice)
essays <- subset(comp.profiles, select = colnames(comp.profiles[,7:16]))
essays <- apply(essays, MARGIN = 1, FUN = paste, collapse = " ")
essays <- str_replace_all(essays, "\n", " ")
essays <- str_replace_all(essays, "<br />", " ")
Do male and female OkCupid users use words at different rates in their essay responses?
comp.profiles$has.book <- str_detect(essays, "book")
tally(has.book ~ sex, comp.profiles, format = 'proportion')
We make similar comparisons for the use of the words "travel," "food," and "wine."
library(tidyr)   # for spread()
comp.profiles$has.wine <- str_detect(essays, "wine")
a <- as.data.frame(tally(has.wine ~ sex, comp.profiles, format = 'proportion')) %>%
  spread(key = has.wine, value = Freq)
comp.profiles$has.travel <- str_detect(essays, "travel")
b <- as.data.frame(tally(has.travel ~ sex, comp.profiles, format = 'proportion')) %>%
  spread(key = has.travel, value = Freq)
comp.profiles$has.food <- str_detect(essays, "food")
c <- as.data.frame(tally(has.food ~ sex, comp.profiles, format = 'proportion')) %>%
  spread(key = has.food, value = Freq)
Combine them and plot the table
join_test <- bind_rows(a, b, c)
join_test$word <- c("wine", "wine", "travel",
                    "travel", "food", "food")
join_test <- join_test[c("word", "sex", "TRUE", "FALSE")]
join_test <- format(join_test, digits = 2)
word_table <- ggtexttable(join_test, rows = NULL,
theme = ttheme("default"))
We can also evaluate the statistical significance of the difference in the use of a given word; here we test the word "car".
comp.profiles$has.car <- str_detect(essays, "car")
results <- tally(~ has.car + sex, data = comp.profiles)
prop.test(x = results[1, ], n = colSums(results), alternative = "two.sided")
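prop.test() returns an htest object; as a small hedged sketch (the object name test is introduced here only for illustration), the pieces usually reported can be pulled out like this, assuming the results table from the chunk above:
# Store the same test and inspect its components
test <- prop.test(x = results[1, ], n = colSums(results),
                  alternative = "two.sided")
test$estimate   # sample proportions for each sex (based on row 1 of the table)
test$p.value    # p-value for the difference in proportions
test$conf.int   # 95% confidence interval for the difference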
Generate the top 500 words used by males and females, respectively:
male.words <- subset(essays, comp.profiles$sex == "m") %>%
str_split(" ") %>%
unlist() %>%
table() %>%
sort(decreasing=TRUE) %>%
names()
female.words <- subset(essays, comp.profiles$sex == "f") %>%
str_split(" ") %>%
unlist() %>%
table() %>%
sort(decreasing=TRUE) %>%
names()
# Words in the males top 500 that weren't in the females' top 500:
setdiff(male.words[1:500], female.words[1:500])
# Words in the female top 500 that weren't in the males' top 500:
setdiff(female.words[1:500], male.words[1:500])
Data preparation
# Select and clean the OkCupid users' essay answers,
# then store them as a corpus data frame for the tokenization and processing that follow
text<-subset(comp.profiles, select=colnames(comp.profiles[,7:16])) %>%
lapply(function(txt) {
gsub("(\n|<br />)"," ",txt) %>%
gsub(' http[^[:blank:]]+', '', .)
})
text <- gsub("c(", "", text, fixed = TRUE)
text <- gsub(")", "", text, fixed = TRUE)
text <- trimws(text)
title <- c("self.intro","life.plan","talented",
"personality","lifestyle.taste",
"view.of.life","fascinated.by","ideal.life",
"my.secrete","what.turn.me.on")
library(corpus)   # corpus_frame(), text_filter(), term_stats(), ...
dta <- corpus_frame(title, text)
Common bigram plots
library(tidyr)
library(tidytext)
dtaa <- as_tibble(dta)
dtaa$text <- as.character(dtaa$text)
#head(dtaa)
dtaa<-dtaa %>% unnest_tokens(bigram, text,
token = "ngrams", n = 2)
bigrams_separated <- dtaa %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
dtaa %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
dplyr::count(word1, word2, word3, sort = TRUE)
head(dtaa)
bigrams_filtered %>%
filter(word2 == "pretty") %>%
count(title, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
library(ggplot2)
library(dplyr)
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(title) %>%
  top_n(8, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~title, ncol = 2, scales = "free") +
  coord_flip()
Word frequencies by essay question
dtaa <- as_tibble(dta)
dtaa$text <- as.character(dtaa$text)
library(stringr)
cleaned_text <- dtaa %>%
group_by(title)
head(cleaned_text)
cleaned_text <- cleaned_text %>%
filter(str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "",
!str_detect(text, "writes(:|\\.\\.\\.)$"),
!text %in% stop_words$word)
library(tidytext)
usenet_words <- cleaned_text %>%
unnest_tokens(word, text)
usenet_words %>%
dplyr::count(word, sort = TRUE)
usenet_words %>%
count(word, sort = TRUE) %>%
filter(!word %in% stop_words$word)
words_by_title <- usenet_words %>%
dplyr::count(title, word, sort = TRUE) %>%
filter(!word %in% stop_words$word) %>%
ungroup()
words_by_title
tf_idf <- words_by_title %>%
bind_tf_idf(word, title, n) %>%
filter(!word %in% stop_words$word) %>%
arrange(desc(tf_idf))
tf_idf
tf_idf %>%
group_by(title) %>%
top_n(12, tf_idf) %>%
filter(!word %in% stop_words$word) %>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ title, scales = "free") +
ylab("tf-idf") +
coord_flip()
library(widyr)
title_cors <- words_by_title %>% filter(!word %in% stop_words$word) %>%
pairwise_cor(title, word, n, sort = TRUE)
title_cors
#install.packages("ggraph")
library(ggraph)
#install.packages("igraph")
The igraph package has many powerful functions
for manipulating and analyzing networks.
library(igraph)
set.seed(2018)
title_cors %>%
filter(correlation > .4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha = correlation, width = correlation)) +
geom_node_point(size = 6, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
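The note above says igraph has many functions for manipulating and analyzing networks; as a hedged sketch of that idea (not part of the original analysis), one could compute simple summaries of the same correlation graph:
# Sketch: basic igraph summaries of the essay-question correlation network
library(dplyr)
library(igraph)
g <- title_cors %>%
  filter(correlation > .4) %>%
  graph_from_data_frame()
degree(g)      # number of strongly correlated neighbours per essay question
components(g)  # connected components of the network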
Sentiment Analysis
We use the AFINN sentiment lexicon, which provides numeric positivity scores for each word, and visualize it with a bar plot
contributions <- usenet_words %>%
dplyr::inner_join(get_sentiments("afinn"), by = "word") %>%
dplyr::group_by(word) %>%
dplyr::summarize(occurences = n(),
contribution = sum(score))
contributions %>%
top_n(25, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip()
top_sentiment_words <- words_by_title %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
mutate(contribution = score * n / sum(n))
top_sentiment_words
top_sentiment_words %>%
group_by(title) %>%
top_n(15, abs(contribution)) %>%
ungroup() %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ title, scales = "free") +
ylab("essays") +
coord_flip()
The plots above suggest that users rarely use negative-sentiment words in their essay answers, so here we compute the proportion of negative words for each essay question.
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
str(usenet_words)
wordcounts <- usenet_words %>%
dplyr::group_by(title) %>%
dplyr::summarize(words = n())
neg_ratio <- usenet_words %>%
dplyr::semi_join(bingnegative) %>%
dplyr::group_by(title) %>%
dplyr::summarize(negativewords = n()) %>%
dplyr::left_join(wordcounts, by = c("title")) %>%
dplyr::mutate(ratio = negativewords/words) %>%
dplyr::filter(title != 0) %>%
top_n(10) %>% arrange(desc(ratio)) %>% ungroup()
neg_ratio.p <- ggtexttable(neg_ratio, rows = NULL,
theme = ttheme("default"))
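The same ratios can also be plotted; this is an optional sketch in the ggplot style used above, assuming the neg_ratio object just created:
# Optional: bar chart of the negative-word ratio per essay question
ggplot(neg_ratio, aes(reorder(title, ratio), ratio)) +
  geom_col(fill = "skyblue2") +
  coord_flip() +
  labs(title = "Negative-word Ratio by Essay Question",
       x = "Essay question", y = "Negative words / total words")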
Male users: sentiment counts and word clouds (life.plan essays)
library(tm)            # Corpus(), tm_map(), DocumentTermMatrix()
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal()
m.text <- as.character(comp.profiles$life.plan)
m <- comp.profiles$sex
m_text <- cbind(m.text, m) %>% subset(m == "m") %>%
  as.data.frame()
maletext <- as.character(m_text$m.text)
sample <- sample(maletext, (length(maletext)))
corpus <- Corpus(VectorSource(list(sample)))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, stemDocument)
dtm_up <- DocumentTermMatrix(VCorpus(VectorSource(corpus[[1]]$content)))
freq_up <- colSums(as.matrix(dtm_up))
head(freq_up)
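# NOTE: `sentiments_up` is used below but is never created in this document.
# A plausible missing step (an assumption, not confirmed by the source) is a
# per-term sentiment classification of the term names, for example:
# library(RSentiment)
# sentiments_up <- calculate_sentiment(names(freq_up))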
sentiments_up <- cbind(sentiments_up, as.data.frame(freq_up))
#head(sentiments_up)
#tail(sentiments_up)
sent_pos_up <- sentiments_up[sentiments_up$sentiment=='Positive',]
sent_neg_up <- sentiments_up[sentiments_up$sentiment=='Negative',]
cat("Negative sentiments: ",sum(sent_neg_up$freq_up),
" Positive sentiments: ",sum(sent_pos_up$freq_up))
wordcloud(sent_pos_up$text,sent_pos_up$freq,min.freq=5,
random.order=FALSE,colors=brewer.pal(6,"Dark2"))
wordcloud(sent_neg_up$text,sent_neg_up$freq,min.freq=5,
random.order=FALSE,colors=brewer.pal(6,"Dark2"))
Female users: sentiment counts and word clouds (life.plan essays)
f.text <- as.character(comp.profiles$life.plan)
f <- comp.profiles$sex
f_text <- cbind(f.text, f) %>% subset( f == "f") %>%
as.data.frame()
femaletext <- as.character(f_text$f.text)
sample <- sample(femaletext, (length(femaletext)))
corpus <- Corpus(VectorSource(list(sample)))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, stemDocument)
dtm_up <- DocumentTermMatrix(VCorpus(VectorSource(corpus[[1]]$content)))
freq_up <- colSums(as.matrix(dtm_up))
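# NOTE: as in the male block above, `sentiments_up` is assumed to come from a
# per-term sentiment classification of names(freq_up), an assumed step not shown
# in the source, e.g. sentiments_up <- RSentiment::calculate_sentiment(names(freq_up))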
sentiments_up <- cbind(sentiments_up, as.data.frame(freq_up))
sent_pos_up <- sentiments_up[sentiments_up$sentiment=='Positive',]
sent_neg_up <- sentiments_up[sentiments_up$sentiment=='Negative',]
cat("Negative sentiments: ",sum(sent_neg_up$freq_up),
" Positive sentiments: ",sum(sent_pos_up$freq_up))
wordcloud(sent_pos_up$text,sent_pos_up$freq,min.freq=5,
random.order=FALSE,colors=brewer.pal(6,"Dark2"))
wordcloud(sent_neg_up$text,sent_neg_up$freq,min.freq=5,
random.order=FALSE,colors=brewer.pal(6,"Dark2"))
# Rebuild the corpus data frame (same preprocessing as above) for the corpus-package analysis
text <- subset(comp.profiles,
               select = colnames(comp.profiles[,7:16])) %>%
  lapply(function(txt) {
    gsub("(\n|<br />)", " ", txt) %>%
      gsub(' http[^[:blank:]]+', '', .)})
text <- gsub("c(", "", text, fixed = TRUE)
text <- gsub(")", "", text, fixed = TRUE)
text <- trimws(text)
title <- c("self.intro","life.plan","talented",
"personality","lifestyle.taste",
"view.of.life","fascinated.by","ideal.life",
"my.secrete","what.turn.me.on")
dta <- corpus_frame(title, text)
class(dta)
head(dta)
Tokenization
A text_filter object controls the rules for segmentation and normalization.
text_filter(dta)
text_filter(dta)$map_case <- FALSE
text_filter(dta)$drop_punct <- TRUE
text_tokens(dta[1,])
text_filter(dta) <- text_filter(drop_punct = TRUE)
The text_stats() function computes token, type, and sentence counts
and presents the results in a data frame.
text_ntoken(dta)
text_nsentence(dta)
stats <- text_stats(dta)
term_stats(dta)
term_stats(dta, subset = !term %in% stopwords_en)
The types argument allows us to request the component types in the result:
Here are the most common 2-, 3-grams starting with “love”,
where the second type is not a function word
term_stats(dta, ngrams = 2:3, types = TRUE, subset = type1 == "love" & !type2 %in% stopwords_en)
text_locate(dta, c("love", "dancing", "music"))
We can also request that the results be returned in random order,
using the text_sample() function.
This is useful for inspecting a random sample of the matches:
text_sample(dta, c("love", "dancing", "music"))
Other functions allow counting term occurrences,
testing for whether a term appears in a text,
and getting the subset of texts containing a term:
text_count(dta, "handsome")
text_detect(dta, "handsome")
text_subset(dta, "handsome")
We can combine text_split() with text_count() to measure the occurrence rate of the term "handsome".
Here, the chunks have varying sizes, so we look at the rates rather than the raw counts.
chunks <- text_split(dta, "tokens", 25000)
size <- text_ntoken(chunks)
unit <- 1000 # rate per 1000 tokens
count <- text_count(chunks, "handsome")
rate <- count / size * unit
i <- seq_along(rate)
plot(i, rate, type = "l", xlab = "Segment",
ylab = "Rate \u00d7 1000",
main = paste(dQuote("handsome"), "Occurrences"), col = 2)
points(i, rate, pch = 16, cex = 0.5, col = 2)
affect_wordnet
This lexicon classifies a large set of terms correlated
with emotional affect into four main categories:
“Positive”, “Negative”, “Ambiguous”, and “Neutral”
and a variety of sub-categories.
with(affect_wordnet, table(category, emotion))
Terms can appear in multiple categories,
or with multiple parts of speech.
Here are some duplicate terms:
subset(affect_wordnet, term %in% c("caring", "chill", "hopeful"))
For our final application,
we will track emotion word usage over the OkCupid essay responses.
We will do this by segmenting the corpus into small chunks,
and then measuring the occurrence rates of
emotion words in these chunks.
First we need a lexicon of emotion words.
We take the WordNet-Affect lexicon as a starting point,
but remove the "Neutral" emotion words.
affect <- subset(affect_wordnet, emotion != "Neutral")
affect$emotion <- droplevels(affect$emotion)     # drop the unused "Neutral" level
affect$category <- droplevels(affect$category)   # drop unused categories
Rather than blindly applying the lexicon,
we first check to see what the most common emotion terms are.
term_stats(dta, subset = term %in% affect$term)
A few terms jump out as unusual:
"black", for example, probably does not evoke emotion.
We inspect the usage of the most common terms with text_locate() (or its randomized variant text_sample()), which shows these terms in context.
text_sample(dta, "black")
It looks like "black" is not used as an emotion word.
We will exclude it from the lexicon.
We can also inspect the first token after each appearance of “black”:
term_stats(text_sub(text_locate(dta, "black")$after, 1, 1))
Over half the time, "black" describes objects; it does not describe or evoke emotion, so we exclude it from the lexicon.
affect <- subset(affect, !term %in% c("black"))
Now that we have a lexicon, our plan is to segment the text into smaller chunks and compute the emotion occurrence rates in each chunk, broken down by category (“Positive”, “Negative”, or “Ambiguous”).
To facilitate the rate computations, we will form a term-by-emotion rate for the lexicon:
term_scores <- with(affect, unclass(table(term, emotion)))
head(term_scores)
Here, term_scores is a matrix with entry (i,j) indicating the number of times that term i appeared in the affect lexicon with emotion j.
We re-classify any term appearing in two or more categories as ambiguous:
ncat <- rowSums(term_scores > 0)
term_scores[ncat > 1, c("Positive", "Negative")] <- 0
term_scores[ncat > 1, "Ambiguous"] <- 1
At this point, every term is in one category, but the score for the term could be 2, 3, or more, depending on the number of sub-categories the term appeared in. We replace these larger values with one.
term_scores[term_scores > 1] <- 1
x <- term_matrix(chunks, select = rownames(term_scores))
text_scores <- x %*% term_scores
unit <- 1000
n <- text_ntoken(chunks)   # tokens per chunk, used as the rate denominator
rate <- list(pos = text_scores[, "Positive"] / n * unit,
             neg = text_scores[, "Negative"] / n * unit,
             ambig = text_scores[, "Ambiguous"] / n * unit)
rate$total <- rate$pos + rate$neg + rate$ambig
se <- lapply(rate, function(r) sqrt(r * (unit - r) / n))
i <- seq_len(nrow(chunks))
par(mar = c(4, 4, 11, 9) + 0.1, las = 1)
xlim <- range(i - 0.5, i + 0.5)
ylim <- range(0, rate$total + se$total, rate$total - se$total)
plot(xlim, ylim, type = "n", xlab = "Segment",
ylab = "Rate \u00d7 1000", axes = FALSE, xaxs = "i")
usr <- par("usr") # get the user coordinates for later
axis(1, at = i[i %% 5 == 0], labels = FALSE)
axis(1, at = i[i %% 10 == 0], labels = TRUE)
axis(2)
abline(v = tapply(i, chunks$parent, min) - 0.5, col = "gray")
labels <- dta$title
at <- tapply(i, chunks$parent, mean)
text(at, usr[4] + 0.01 * diff(usr[3:4]),
labels = labels, adj = 0, srt = 45, cex = 0.8, xpd = TRUE)
box()
col <- c(total = "#000000", pos = "#FC8D62",
neg = "#8DA0CB", ambig = "#66C2A5")
legend(usr[2] + 0.015 * diff(usr[1:2]), usr[3] + 0.8 * diff(usr[3:4]),
legend = c("Total", "Positive", "Negative", "Ambiguous"),
title = expression(bold("Emotion")),
fill = col[c("total", "pos", "neg", "ambig")],
cex = 0.8, xpd = TRUE)
abline(h = mean(rate$total), lty = 2, col = col[["total"]])
for (t in c("ambig", "neg", "pos", "total")) {
r <- rate[[t]]
s <- se[[t]]
cl <- col[[t]]
lines(i, r, col = cl)
points(i, r, col = cl, pch = 16, cex = 0.5)
if (t == "total") {
int <- abs((r - mean(r)) / sd(r)) > 2
segments(i[int], (r - s)[int], i[int],
(r + s)[int], col = cl)
segments((i - .2)[int], (r - s)[int], (i + .2)[int],
(r - s)[int], col = cl)
segments((i - .2)[int], (r + s)[int], (i + .2)[int],
(r + s)[int], col = cl)
}
}
The EDA part of this text analysis could still be improved, for a few reasons:
1. Time pressure: data preprocessing (cleaning, wrangling, etc.) is very time-consuming.
2. Hardware limits: an underpowered computer crashed frequently, which also cost time; we restricted ourselves to users with complete fields partly because the full dataset was too large and too messy to clean.
For real text-analysis work, we recommend cleaning the data as carefully as possible,
since data quality carries over to more advanced analyses, for example the predictive performance of models trained with machine learning.