options(scipen = 999)
#### Load libraries
library(ggplot2)
library(tidyverse)
library(tibble)
library(cowplot)
library(fitdistrplus)
library(readr)
library(readxl)
library(stringr)
library(ggmosaic)
library(tidytext)
library(prettydoc)
library(viridis)
#### Load dataset
okc_df <- read.csv("okcupid_profiles.csv")
okc_new <- okc_df
#### Need to rename the following columns:
# essay0- My self summary
# essay1- What I’m doing with my life
# essay2- I’m really good at
# essay3- The first thing people usually notice about me
# essay4- Favorite books, movies, shows, music, and food
# essay5- The six things I could never do without
# essay6- I spend a lot of time thinking about
# essay7- On a typical Friday night I am
# essay8- The most private thing I am willing to admit
# essay9- You should message me if...
okc_new <- okc_new %>% rename(
self_summary = essay0,
doing_with_my_life = essay1,
good_at = essay2,
first_notice_abt_me = essay3,
fav_hobbies = essay4,
six_things = essay5,
think_abt = essay6,
friday_night_thing = essay7,
most_pvt_thing = essay8,
msg_me_if = essay9)
# Let's check the dataframe
glimpse(okc_new)
## Rows: 59,946
## Columns: 31
## $ age <int> 22, 35, 38, 23, 29, 29, 32, 31, 24, 37, 35, 28, 24…
## $ status <chr> "single", "single", "available", "single", "single…
## $ sex <chr> "m", "m", "m", "m", "m", "m", "f", "f", "f", "m", …
## $ orientation <chr> "straight", "straight", "straight", "straight", "s…
## $ body_type <chr> "a little extra", "average", "thin", "thin", "athl…
## $ diet <chr> "strictly anything", "mostly other", "anything", "…
## $ drinks <chr> "socially", "often", "socially", "socially", "soci…
## $ drugs <chr> "never", "sometimes", "", "", "never", "", "never"…
## $ education <chr> "working on college/university", "working on space…
## $ ethnicity <chr> "asian, white", "white", "", "white", "asian, blac…
## $ height <dbl> 75, 70, 68, 71, 66, 67, 65, 65, 67, 65, 70, 72, 72…
## $ income <int> -1, 80000, -1, 20000, -1, -1, -1, -1, -1, -1, -1, …
## $ job <chr> "transportation", "hospitality / travel", "", "stu…
## $ last_online <chr> "2012-06-28-20-30", "2012-06-29-21-41", "2012-06-2…
## $ location <chr> "south san francisco, california", "oakland, calif…
## $ offspring <chr> "doesn't have kids, but might want them", "doesn't…
## $ pets <chr> "likes dogs and likes cats", "likes dogs and likes…
## $ religion <chr> "agnosticism and very serious about it", "agnostic…
## $ sign <chr> "gemini", "cancer", "pisces but it doesn’t m…
## $ smokes <chr> "sometimes", "no", "no", "no", "no", "no", "", "no…
## $ speaks <chr> "english", "english (fluently), spanish (poorly), …
## $ self_summary <chr> "about me: i would love to think that i was some …
## $ doing_with_my_life <chr> "currently working as an international agent for a…
## $ good_at <chr> "making people laugh. ranting about a good salting…
## $ first_notice_abt_me <chr> "the way i look. i am a six foot half asian, half …
## $ fav_hobbies <chr> "books: absurdistan, the republic, of mice and men…
## $ six_things <chr> "food. water. cell phone. shelter.", "delicious po…
## $ think_abt <chr> "duality and humorous things", "", "", "cats and g…
## $ friday_night_thing <chr> "trying to find someone to hang out with. i am dow…
## $ most_pvt_thing <chr> "i am new to california and looking for someone to…
## $ msg_me_if <chr> "you want to be swept off your feet! you are tired…
# Replace empty strings with NA, then drop rows with any missing values
okc_new <- okc_new %>% na_if("") %>% na.omit()
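na_if() is documented as a vector function, so a slightly more explicit, equivalent way to write the same step (a sketch, assuming the blanks only occur in character columns) is:
# Equivalent alternative: replace empty strings with NA column by column,
# then drop any row that still contains a missing value
okc_new <- okc_new %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  na.omit()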
# Let's change some of the column structures
okc_new$status <- as.factor(okc_new$status)
okc_new$sex <- as.factor(okc_new$sex)
okc_new$orientation <- as.factor(okc_new$orientation)
okc_new$body_type <- as.factor(okc_new$body_type)
okc_new$diet <- as.factor(okc_new$diet)
okc_new$drinks <- as.factor(okc_new$drinks)
okc_new$drugs <- as.factor(okc_new$drugs)
okc_new$education <- as.factor(okc_new$education)
okc_new$ethnicity <- as.factor(okc_new$ethnicity)
okc_new$job <- as.factor(okc_new$job)
okc_new$offspring <- as.factor(okc_new$offspring)
okc_new$pets <- as.factor(okc_new$pets)
okc_new$religion <- as.factor(okc_new$religion)
okc_new$sign <- as.factor(okc_new$sign)
okc_new$smokes <- as.factor(okc_new$smokes)
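The fifteen assignments above could also be collapsed into a single mutate() with across(); a sketch of that equivalent alternative (same columns, same result):
# Equivalent alternative: convert all the categorical columns to factors at once
okc_new <- okc_new %>%
  mutate(across(c(status, sex, orientation, body_type, diet, drinks, drugs,
                  education, ethnicity, job, offspring, pets, religion, sign,
                  smokes), as.factor))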
# Let's recheck the dataframe
glimpse(okc_new)
## Rows: 4,407
## Columns: 31
## $ age <int> 22, 33, 30, 29, 31, 45, 23, 50, 21, 50, 26, 26, 27…
## $ status <fct> single, single, single, single, single, single, si…
## $ sex <fct> m, m, m, m, f, f, f, m, m, m, f, f, m, m, m, m, f,…
## $ orientation <fct> straight, straight, straight, straight, straight, …
## $ body_type <fct> a little extra, athletic, fit, fit, curvy, fit, cu…
## $ diet <fct> strictly anything, mostly anything, mostly anythin…
## $ drinks <fct> socially, socially, socially, socially, socially, …
## $ drugs <fct> never, never, never, sometimes, sometimes, never, …
## $ education <fct> working on college/university, graduated from mast…
## $ ethnicity <fct> "asian, white", "white", "white", "white", "white"…
## $ height <dbl> 75, 72, 69, 67, 66, 64, 66, 75, 70, 73, 63, 68, 68…
## $ income <int> -1, -1, -1, 40000, -1, -1, 20000, -1, 1000000, 800…
## $ job <fct> transportation, science / tech / engineering, exec…
## $ last_online <chr> "2012-06-28-20-30", "2012-06-27-21-41", "2012-06-2…
## $ location <chr> "south san francisco, california", "san francisco,…
## $ offspring <fct> "doesn't have kids, but might want them", "doesn't…
## $ pets <fct> likes dogs and likes cats, likes dogs and likes ca…
## $ religion <fct> agnosticism and very serious about it, catholicism…
## $ sign <fct> gemini, pisces and it’s fun to think about, …
## $ smokes <fct> sometimes, no, no, no, no, no, when drinking, no, …
## $ speaks <chr> "english", "english (fluently)", "english, spanish…
## $ self_summary <chr> "about me: i would love to think that i was some …
## $ doing_with_my_life <chr> "currently working as an international agent for a…
## $ good_at <chr> "making people laugh. ranting about a good salting…
## $ first_notice_abt_me <chr> "the way i look. i am a six foot half asian, half …
## $ fav_hobbies <chr> "books: absurdistan, the republic, of mice and men…
## $ six_things <chr> "food. water. cell phone. shelter.", "juicy fruit,…
## $ think_abt <chr> "duality and humorous things", "my passions and se…
## $ friday_night_thing <chr> "trying to find someone to hang out with. i am dow…
## $ most_pvt_thing <chr> "i am new to california and looking for someone to…
## $ msg_me_if <chr> "you want to be swept off your feet! you are tired…
# Looks like the entry before the comma in the Location Column is the name of the city.
okc_new <- okc_new %>% separate(location, c("City", "State"), sep = ",")
okc_new$City <- as.factor(okc_new$City)
okc_new$State <- as.factor(okc_new$State)
# An income of -1 means the user did not report it. Keep only profiles with a reported income
okc_new <- okc_new %>% filter(income >= 0)
#### Exercise: Find out what proportion of the dataset did not report their income
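A hint rather than a full solution: -1 is the placeholder for an unreported income in this dataset, so one possible sketch (run on the raw okc_df, because okc_new has already been filtered above) is:
# Proportion of all profiles that did not report income
okc_df %>% summarize(prop_no_income = mean(income == -1, na.rm = TRUE))
# The same idea grouped by sex previews the per-gender exercise further down
okc_df %>% group_by(sex) %>% summarize(prop_no_income = mean(income == -1, na.rm = TRUE))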
#### Let's look at the education levels
okc_new %>% dplyr::select(education) %>% distinct()
# Stop words from tidytext, renamed to match the word tables built later
stopWords <- stop_words %>% rename(Words = word)
Let's classify education into the following categories:
High school
College/university
Post-graduate
Space academy
Other
#### Create the edu_lvl column
okc_new <- okc_new %>%
  mutate(edu_lvl = case_when(
    str_detect(education, "high") ~ "High_School",
    str_detect(education, "college|university") ~ "College",
    str_detect(education, "ph|masters|law|med") ~ "Post_Grad",
    str_detect(education, "space") ~ "Space_Academy",
    TRUE ~ "Other")) %>%
  dplyr::select(-education)
okc_new$edu_lvl <- as.factor(okc_new$edu_lvl)
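An optional sanity check on the recode is to tabulate the new levels:
# How many profiles fall into each education level?
okc_new %>% count(edu_lvl, sort = TRUE)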
# Let's get rid of the last online column, don't need it
okc_new <- okc_new %>% dplyr::select(-last_online)
# Let's change the position of the edu_lvl column and bring it closer to the front
okc_new <- okc_new %>% relocate(edu_lvl, .before = job)
#### Partition the dataset according to genders
males <- okc_new %>% filter(sex == "m")
females <- okc_new %>% filter(sex != "m")
#### Proportion of genders in the dataset that reported income
# females
(count(females)/count(okc_new)) * 100 # about 33%
# males
(count(males)/count(okc_new)) * 100 # about 66.5%
#### Exercise: Find what proportion of each gender did not report their income
females_txt <- females %>% dplyr::select(age, income, edu_lvl, job, 22:31)
Let's check the most common words females use in each essay column.
summaryWords_F <- females_txt$self_summary %>%
str_replace_all("[[:punct:]]", "") %>% str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alnum:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 7,131 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 913 0.0359
## 2 love 547 0.0215
## 3 life 296 0.0116
## 4 people 289 0.0114
## 5 dont 246 0.00967
## 6 time 232 0.00912
## 7 friends 181 0.00712
## 8 enjoy 164 0.00645
## 9 fun 154 0.00605
## 10 ive 128 0.00503
## # … with 7,121 more rows
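Before plotting, a note on repetition: the tokenise-count-plot pattern above is repeated below for every essay column and for both sexes. The code is left written out in full, but purely as a sketch, the pattern could be wrapped in two hypothetical helpers (count_words() and plot_top_words(), not used in the rest of the notebook), assuming the same cleaning rules as the pipelines in this notebook (strip punctuation, drop tokens containing digits, keep alphabetic tokens, remove stop words):
# Hypothetical helper: word counts for one free-text column
count_words <- function(txt, stop_tbl = stopWords) {
  txt %>%
    str_replace_all("[[:punct:]]", "") %>%     # strip punctuation
    str_match_all("\\S+") %>%                  # split on whitespace
    unlist() %>%
    as.data.frame() %>%
    setNames("Words") %>%
    filter(!str_detect(Words, "\\d")) %>%      # drop tokens containing digits
    filter(str_detect(Words, "[:alpha:]")) %>% # keep tokens with letters
    group_by(Words) %>%
    summarize(count = n()) %>%
    anti_join(stop_tbl, by = "Words") %>%      # remove stop words
    mutate(prop = count / sum(count)) %>%
    arrange(desc(count))
}
# Hypothetical helper: horizontal bar chart of the top n words
plot_top_words <- function(word_tbl, plot_title, n_top = 50) {
  word_tbl %>%
    head(n_top) %>%
    mutate(Words = fct_reorder(Words, count)) %>%
    ggplot(aes(x = count, y = Words, fill = Words)) +
    geom_col() +                               # same as geom_bar(stat = "identity")
    theme_cowplot() +
    scale_fill_viridis(discrete = TRUE) +
    labs(title = plot_title) +
    theme(legend.position = "none", axis.text.x = element_text(angle = 90))
}
# e.g. count_words(females_txt$self_summary) should reproduce summaryWords_F, and
# plot_top_words(summaryWords_F, "Top 50 words used in Summary Section (Females)")
# should reproduce the plot below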
summaryWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
labs(title = "Top 50 words used in Summary Section (Females)") +
scale_fill_viridis(discrete = TRUE) +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
lifeWords_F <- females_txt$doing_with_my_life %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 3,561 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 328 0.0341
## 2 time 168 0.0174
## 3 life 147 0.0153
## 4 love 126 0.0131
## 5 school 105 0.0109
## 6 job 72 0.00748
## 7 people 69 0.00717
## 8 friends 63 0.00654
## 9 ive 56 0.00582
## 10 living 52 0.00540
## # … with 3,551 more rows
lifeWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Doing with My Life Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
good_atWords_F <- females_txt$good_at %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,731 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 238 0.0359
## 2 people 197 0.0297
## 3 cooking 87 0.0131
## 4 love 68 0.0103
## 5 listening 66 0.00996
## 6 dancing 58 0.00875
## 7 laugh 48 0.00724
## 8 friend 46 0.00694
## 9 writing 44 0.00664
## 10 dont 43 0.00649
## # … with 2,721 more rows
good_atWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Good At Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
abt_meWords_F <- females_txt$first_notice_abt_me %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 1,375 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 193 0.0563
## 2 smile 161 0.0470
## 3 eyes 116 0.0338
## 4 people 94 0.0274
## 5 hair 74 0.0216
## 6 dont 47 0.0137
## 7 laugh 44 0.0128
## 8 notice 41 0.0120
## 9 pretty 38 0.0111
## 10 lot 30 0.00875
## # … with 1,365 more rows
abt_meWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in First Thing to Notice About Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
hobbiesWords_F <- females_txt$fav_hobbies %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 8,374 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 love 480 0.0175
## 2 music 441 0.0161
## 3 food 420 0.0153
## 4 movies 360 0.0131
## 5 books 310 0.0113
## 6 favorite 243 0.00887
## 7 im 240 0.00876
## 8 read 155 0.00565
## 9 dont 134 0.00489
## 10 time 121 0.00441
## # … with 8,364 more rows
hobbiesWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Hobbies Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
six_thingsWords_F <- females_txt$six_things %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,190 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 friends 184 0.0339
## 2 family 158 0.0291
## 3 music 126 0.0232
## 4 love 94 0.0173
## 5 food 82 0.0151
## 6 water 60 0.0110
## 7 books 45 0.00828
## 8 coffee 42 0.00773
## 9 phone 41 0.00754
## 10 laughter 39 0.00718
## # … with 2,180 more rows
six_thingsWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in 6 Things Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
think_abtWords_F <- females_txt$think_abt %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,445 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 life 113 0.0223
## 2 im 89 0.0175
## 3 people 72 0.0142
## 4 time 68 0.0134
## 5 world 60 0.0118
## 6 future 59 0.0116
## 7 love 59 0.0116
## 8 lot 54 0.0106
## 9 thinking 54 0.0106
## 10 friends 28 0.00552
## # … with 2,435 more rows
think_abtWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Think About Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
friday_nightWords_F <- females_txt$friday_night_thing %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 1,631 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 friends 183 0.0407
## 2 home 130 0.0289
## 3 watching 85 0.0189
## 4 movie 78 0.0174
## 5 im 77 0.0171
## 6 dinner 71 0.0158
## 7 friday 69 0.0154
## 8 night 64 0.0143
## 9 dancing 49 0.0109
## 10 hanging 45 0.0100
## # … with 1,621 more rows
friday_nightWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Friday Night Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
pvt_thingWords_F <- females_txt$most_pvt_thing %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>%
anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 1,997 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 172 0.0475
## 2 dont 55 0.0152
## 3 love 50 0.0138
## 4 private 47 0.0130
## 5 ive 40 0.0110
## 6 ill 29 0.00801
## 7 time 26 0.00718
## 8 admit 24 0.00663
## 9 people 24 0.00663
## 10 person 20 0.00552
## # … with 1,987 more rows
pvt_thingWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Most Private Thing Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
msg_meWords_F <- females_txt$msg_me_if %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,778 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 youre 164 0.0230
## 2 dont 107 0.0150
## 3 im 103 0.0144
## 4 life 84 0.0118
## 5 fun 71 0.00995
## 6 people 63 0.00883
## 7 love 61 0.00855
## 8 message 59 0.00827
## 9 enjoy 46 0.00645
## 10 person 46 0.00645
## # … with 2,768 more rows
msg_meWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Message Me If Section (Females)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
Now let's run the same text analysis for the males.
males_txt <- males %>% dplyr::select(age, income, edu_lvl, job, 22:31)
# 1. Self summary:
summaryWords_M <- males_txt$self_summary %>%
str_replace_all("[[:punct:]]", "") %>% str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alnum:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 10,762 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 1660 0.0361
## 2 love 787 0.0171
## 3 life 526 0.0114
## 4 people 462 0.0100
## 5 time 389 0.00845
## 6 dont 382 0.00830
## 7 ive 272 0.00591
## 8 friends 266 0.00578
## 9 enjoy 254 0.00552
## 10 fun 218 0.00474
## # … with 10,752 more rows
summaryWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Summary Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 2. Doing with life:
lifeWords_M <- males_txt$doing_with_my_life %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 5,751 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 640 0.0339
## 2 time 234 0.0124
## 3 life 220 0.0117
## 4 love 194 0.0103
## 5 job 140 0.00742
## 6 friends 138 0.00731
## 7 school 131 0.00694
## 8 ive 117 0.00620
## 9 people 116 0.00615
## 10 day 106 0.00562
## # … with 5,741 more rows
lifeWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Doing with My Life Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 3. Good at:
good_atWords_M <- males_txt$good_at %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 4,725 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 388 0.0289
## 2 people 300 0.0223
## 3 cooking 139 0.0103
## 4 listening 130 0.00967
## 5 laugh 99 0.00736
## 6 pretty 99 0.00736
## 7 love 98 0.00729
## 8 playing 84 0.00625
## 9 games 67 0.00498
## 10 music 67 0.00498
## # … with 4,715 more rows
good_atWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Good At Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 4. First thing people notice about me:
abt_meWords_M <- males_txt$first_notice_abt_me %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,444 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 386 0.0565
## 2 eyes 193 0.0282
## 3 people 191 0.0279
## 4 smile 165 0.0241
## 5 hair 118 0.0173
## 6 notice 93 0.0136
## 7 dont 87 0.0127
## 8 humor 67 0.00980
## 9 lot 60 0.00878
## 10 sense 58 0.00849
## # … with 2,434 more rows
abt_meWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in First Thing to Notice About Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 5. Favorite Hobbies:
hobbiesWords_M <- males_txt$fav_hobbies %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 12,843 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 music 793 0.0147
## 2 food 757 0.0141
## 3 movies 706 0.0131
## 4 books 604 0.0112
## 5 love 559 0.0104
## 6 im 444 0.00826
## 7 favorite 366 0.00680
## 8 dont 257 0.00478
## 9 read 234 0.00435
## 10 tv 233 0.00433
## # … with 12,833 more rows
hobbiesWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Hobbies Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 6. Six things:
six_thingsWords_M <- males_txt$six_things %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 3,611 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 friends 341 0.0325
## 2 music 258 0.0246
## 3 family 255 0.0243
## 4 food 207 0.0198
## 5 love 142 0.0136
## 6 water 100 0.00954
## 7 phone 88 0.00840
## 8 internet 80 0.00764
## 9 im 77 0.00735
## 10 books 72 0.00687
## # … with 3,601 more rows
six_thingsWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in 6 Things Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 7. I think a lot about:
think_abtWords_M <- males_txt$think_abt %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 4,106 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 life 229 0.0230
## 2 im 178 0.0179
## 3 people 149 0.0150
## 4 future 125 0.0126
## 5 time 125 0.0126
## 6 world 115 0.0115
## 7 thinking 111 0.0111
## 8 lot 85 0.00853
## 9 friends 60 0.00602
## 10 spend 60 0.00602
## # … with 4,096 more rows
think_abtWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Think About Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 8. On a typical Friday night:
friday_nightWords_M <- males_txt$friday_night_thing %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 2,721 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 friends 378 0.0441
## 2 home 174 0.0203
## 3 im 171 0.0200
## 4 watching 128 0.0149
## 5 movie 116 0.0135
## 6 friday 115 0.0134
## 7 hanging 115 0.0134
## 8 night 110 0.0128
## 9 dinner 101 0.0118
## 10 time 81 0.00945
## # … with 2,711 more rows
friday_nightWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Friday Night Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 9. Most private thing I can reveal:
pvt_thingWords_M <- males_txt$most_pvt_thing %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 3,400 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 im 294 0.0403
## 2 dont 98 0.0134
## 3 love 98 0.0134
## 4 people 76 0.0104
## 5 private 76 0.0104
## 6 ive 63 0.00863
## 7 time 59 0.00808
## 8 admit 53 0.00726
## 9 ill 53 0.00726
## 10 life 35 0.00479
## # … with 3,390 more rows
pvt_thingWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Most Private Thing Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
# 10. You should message me if:
msg_meWords_M <- males_txt$msg_me_if %>% str_replace_all("[[:punct:]]", "") %>%
str_match_all("\\S+") %>%
unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
filter(str_detect(Words, "[:alpha:]")) %>%
group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
mutate(prop = count/sum(count)) %>%
arrange(desc(count)) %>% print(n = 10)
## # A tibble: 4,056 × 3
## Words count prop
## <chr> <int> <dbl>
## 1 youre 303 0.0251
## 2 dont 176 0.0146
## 3 im 174 0.0144
## 4 fun 143 0.0118
## 5 love 132 0.0109
## 6 message 124 0.0103
## 7 life 116 0.00959
## 8 time 98 0.00811
## 9 meet 89 0.00736
## 10 enjoy 87 0.00720
## # … with 4,046 more rows
msg_meWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
ggplot(aes(x = count, y = Words, fill = Words)) +
geom_bar(stat = "identity") +
theme_cowplot() +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Top 50 words used in Message Me If Section (Males)") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90))
Let's do a proportional study to understand each gender's affinity for particular words in each essay section. We'll treat females as the target group first and see how much more often they use a given word than their male counterparts do.
inner_join(summaryWords_F, summaryWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
Women use the words "dreamer" and "piercing" almost 1350% more often than men, although ratios built on only a handful of occurrences should be read as rough signals rather than precise estimates.
inner_join(lifeWords_F, lifeWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(good_atWords_F, good_atWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(abt_meWords_F, abt_meWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(hobbiesWords_F, hobbiesWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(six_thingsWords_F, six_thingsWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(think_abtWords_F, think_abtWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(pvt_thingWords_F, pvt_thingWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(msg_meWords_F, msg_meWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
Now let's flip the ratio and look at which words men favour relative to women.
inner_join(summaryWords_F, summaryWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(lifeWords_F, lifeWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(good_atWords_F, good_atWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(abt_meWords_F, abt_meWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(hobbiesWords_F, hobbiesWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(six_thingsWords_F, six_thingsWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(think_abtWords_F, think_abtWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(pvt_thingWords_F, pvt_thingWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
inner_join(msg_meWords_F, msg_meWords_M, by = "Words") %>%
rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))
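All of the joins above share the same shape and differ only in which table supplies the numerator. Purely as a sketch, a hypothetical helper (word_affinity(), not used in the code above) could compute the affinity in either direction:
# Hypothetical helper: relative affinity for words shared by two count tables.
# Whichever table is passed first supplies the numerator of the ratio.
word_affinity <- function(target_tbl, reference_tbl) {
  inner_join(target_tbl, reference_tbl, by = "Words", suffix = c("_target", "_ref")) %>%
    mutate(affinityToWords = prop_target / prop_ref) %>%
    dplyr::select(Words, prop_target, prop_ref, affinityToWords) %>%
    arrange(desc(affinityToWords))
}
# e.g. word_affinity(summaryWords_F, summaryWords_M)  # words females favour
#      word_affinity(summaryWords_M, summaryWords_F)  # words males favour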