1 Load and prepare data

options(scipen = 999)

#### Load libraries
library(ggplot2)
library(tidyverse)
library(tibble)
library(cowplot)
library(fitdistrplus)
library(readr)
library(readxl)
library(stringr)
library(ggmosaic)
library(tidytext)
library(prettydoc)
library(viridis)

#### Load dataset
okc_df <- read.csv("okcupid_profiles.csv")

okc_new <- okc_df

#### Need to rename the following columns:

# essay0- My self summary 
# essay1- What I’m doing with my life
# essay2- I’m really good at
# essay3- The first thing people usually notice about me
# essay4- Favorite books, movies, show, music, and food
# essay5- The six things I could never do without
# essay6- I spend a lot of time thinking about
# essay7- On a typical Friday night I am
# essay8- The most private thing I am willing to admit
# essay9- You should message me if...

okc_new <- okc_new %>% rename(
          self_summary = essay0,
          doing_with_my_life = essay1,
          good_at = essay2,
          first_notice_abt_me = essay3,
          fav_hobbies = essay4,
          six_things = essay5,
          think_abt = essay6,
          friday_night_thing = essay7,
          most_pvt_thing = essay8,
          msg_me_if = essay9)


# Let's check the dataframe
glimpse(okc_new)

## Rows: 59,946
## Columns: 31
## $ age                 <int> 22, 35, 38, 23, 29, 29, 32, 31, 24, 37, 35, 28, 24…
## $ status              <chr> "single", "single", "available", "single", "single…
## $ sex                 <chr> "m", "m", "m", "m", "m", "m", "f", "f", "f", "m", …
## $ orientation         <chr> "straight", "straight", "straight", "straight", "s…
## $ body_type           <chr> "a little extra", "average", "thin", "thin", "athl…
## $ diet                <chr> "strictly anything", "mostly other", "anything", "…
## $ drinks              <chr> "socially", "often", "socially", "socially", "soci…
## $ drugs               <chr> "never", "sometimes", "", "", "never", "", "never"…
## $ education           <chr> "working on college/university", "working on space…
## $ ethnicity           <chr> "asian, white", "white", "", "white", "asian, blac…
## $ height              <dbl> 75, 70, 68, 71, 66, 67, 65, 65, 67, 65, 70, 72, 72…
## $ income              <int> -1, 80000, -1, 20000, -1, -1, -1, -1, -1, -1, -1, …
## $ job                 <chr> "transportation", "hospitality / travel", "", "stu…
## $ last_online         <chr> "2012-06-28-20-30", "2012-06-29-21-41", "2012-06-2…
## $ location            <chr> "south san francisco, california", "oakland, calif…
## $ offspring           <chr> "doesn't have kids, but might want them", "doesn't…
## $ pets                <chr> "likes dogs and likes cats", "likes dogs and likes…
## $ religion            <chr> "agnosticism and very serious about it", "agnostic…
## $ sign                <chr> "gemini", "cancer", "pisces but it doesn&rsquo;t m…
## $ smokes              <chr> "sometimes", "no", "no", "no", "no", "no", "", "no…
## $ speaks              <chr> "english", "english (fluently), spanish (poorly), …
## $ self_summary        <chr> "about me:  i would love to think that i was some …
## $ doing_with_my_life  <chr> "currently working as an international agent for a…
## $ good_at             <chr> "making people laugh. ranting about a good salting…
## $ first_notice_abt_me <chr> "the way i look. i am a six foot half asian, half …
## $ fav_hobbies         <chr> "books: absurdistan, the republic, of mice and men…
## $ six_things          <chr> "food. water. cell phone. shelter.", "delicious po…
## $ think_abt           <chr> "duality and humorous things", "", "", "cats and g…
## $ friday_night_thing  <chr> "trying to find someone to hang out with. i am dow…
## $ most_pvt_thing      <chr> "i am new to california and looking for someone to…
## $ msg_me_if           <chr> "you want to be swept off your feet! you are tired…

# Remove all the empty cells
okc_new <- okc_new %>% na_if("") %>% na.omit()

# Let's change some of the column structures
okc_new$status <- as.factor(okc_new$status)
okc_new$sex <- as.factor(okc_new$sex)
okc_new$orientation <- as.factor(okc_new$orientation)
okc_new$body_type <- as.factor(okc_new$body_type)
okc_new$diet <- as.factor(okc_new$diet)
okc_new$drinks <- as.factor(okc_new$drinks)
okc_new$drugs <- as.factor(okc_new$drugs)
okc_new$education <- as.factor(okc_new$education)
okc_new$ethnicity <- as.factor(okc_new$ethnicity)
okc_new$job <- as.factor(okc_new$job)
okc_new$offspring <- as.factor(okc_new$offspring)
okc_new$pets <- as.factor(okc_new$pets)
okc_new$religion <- as.factor(okc_new$religion)
okc_new$sign <- as.factor(okc_new$sign)
okc_new$smokes <- as.factor(okc_new$smoke)

# Let's recheck the dataframe
glimpse(okc_new)

## Rows: 4,407
## Columns: 31
## $ age                 <int> 22, 33, 30, 29, 31, 45, 23, 50, 21, 50, 26, 26, 27…
## $ status              <fct> single, single, single, single, single, single, si…
## $ sex                 <fct> m, m, m, m, f, f, f, m, m, m, f, f, m, m, m, m, f,…
## $ orientation         <fct> straight, straight, straight, straight, straight, …
## $ body_type           <fct> a little extra, athletic, fit, fit, curvy, fit, cu…
## $ diet                <fct> strictly anything, mostly anything, mostly anythin…
## $ drinks              <fct> socially, socially, socially, socially, socially, …
## $ drugs               <fct> never, never, never, sometimes, sometimes, never, …
## $ education           <fct> working on college/university, graduated from mast…
## $ ethnicity           <fct> "asian, white", "white", "white", "white", "white"…
## $ height              <dbl> 75, 72, 69, 67, 66, 64, 66, 75, 70, 73, 63, 68, 68…
## $ income              <int> -1, -1, -1, 40000, -1, -1, 20000, -1, 1000000, 800…
## $ job                 <fct> transportation, science / tech / engineering, exec…
## $ last_online         <chr> "2012-06-28-20-30", "2012-06-27-21-41", "2012-06-2…
## $ location            <chr> "south san francisco, california", "san francisco,…
## $ offspring           <fct> "doesn't have kids, but might want them", "doesn't…
## $ pets                <fct> likes dogs and likes cats, likes dogs and likes ca…
## $ religion            <fct> agnosticism and very serious about it, catholicism…
## $ sign                <fct> gemini, pisces and it&rsquo;s fun to think about, …
## $ smokes              <fct> sometimes, no, no, no, no, no, when drinking, no, …
## $ speaks              <chr> "english", "english (fluently)", "english, spanish…
## $ self_summary        <chr> "about me:  i would love to think that i was some …
## $ doing_with_my_life  <chr> "currently working as an international agent for a…
## $ good_at             <chr> "making people laugh. ranting about a good salting…
## $ first_notice_abt_me <chr> "the way i look. i am a six foot half asian, half …
## $ fav_hobbies         <chr> "books: absurdistan, the republic, of mice and men…
## $ six_things          <chr> "food. water. cell phone. shelter.", "juicy fruit,…
## $ think_abt           <chr> "duality and humorous things", "my passions and se…
## $ friday_night_thing  <chr> "trying to find someone to hang out with. i am dow…
## $ most_pvt_thing      <chr> "i am new to california and looking for someone to…
## $ msg_me_if           <chr> "you want to be swept off your feet! you are tired…

# Looks like the entry before the comma in the Location Column is the name of the city. 
okc_new <- okc_new %>% separate(location, c("City", "State"), sep = ",")
okc_new$City <- as.factor(okc_new$City)
okc_new$State <- as.factor(okc_new$State)

# Looks like there are some negative values for income. Let's get rid of them
okc_new <- okc_new %>% filter(income >= 0)

#### Excercise: Find out what proportion of the dataset did not report their income

#### Let's look at the education levels
okc_new %>% dplyr::select(education) %>% distinct()

# Stop words
stopWords <- stop_words %>% rename(Words = word)

Let’s classify education in the following categories:

Post Graduate Degree
University Degree
High School
Unknown

#### Create vector Edu_Lvl
okc_new <- okc_new %>% 
  mutate(edu_lvl = case_when(str_detect(education, "high") ~ 'High_School',
                                                  str_detect(education, 'college|university"') ~ 'College',
                                                  str_detect(education, 'ph|masters|law|med') ~ 'Post_Grad',
                                                  str_detect(education, 'space') ~ 'Space_Academy',
                                                  TRUE ~ 'Other')) %>% dplyr::select(-education)
okc_new$edu_lvl <- as.factor(okc_new$edu_lvl)

# Let's get rid of the last online column, don't need it
okc_new <- okc_new %>% dplyr::select(-last_online)

# Let's change the position of the edu_lvl column and bring it closer to the front
okc_new <- okc_new %>% relocate(edu_lvl, .before = job)

2 Partition by gender (M/F)

#### Partition the dataset according to genders
males <- okc_new %>% filter(sex == "m")
females <- okc_new %>% filter(sex != "m")

#### Proportion of genders in the dataset that reported income
# females
(count(females)/count(okc_new)) * 100 # about 33%

# males
(count(males)/ count(okc_new)) * 100 # about 66.5%

#### Exercise: Find the proportion of respective genders who did not report their incomes

3 Text Analysis

3.1 Females

females_txt <- females %>% dplyr::select(age, income, edu_lvl, job, 22:31)

Check the most common words by each column

3.1.1 Summary:

summaryWords_F <- females_txt$self_summary %>% 
  str_replace_all("[[:punct:]]", "") %>% str_match_all("\\S+") %>%  
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alnum:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 7,131 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im        913 0.0359 
##  2 love      547 0.0215 
##  3 life      296 0.0116 
##  4 people    289 0.0114 
##  5 dont      246 0.00967
##  6 time      232 0.00912
##  7 friends   181 0.00712
##  8 enjoy     164 0.00645
##  9 fun       154 0.00605
## 10 ive       128 0.00503
## # … with 7,121 more rows

summaryWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  labs(title = "Top 50 words used in Summary Section (Females)") +
  scale_fill_viridis(discrete = TRUE) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.2 Doing with life:

lifeWords_F <- females_txt$doing_with_my_life %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 3,561 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im        328 0.0341 
##  2 time      168 0.0174 
##  3 life      147 0.0153 
##  4 love      126 0.0131 
##  5 school    105 0.0109 
##  6 job        72 0.00748
##  7 people     69 0.00717
##  8 friends    63 0.00654
##  9 ive        56 0.00582
## 10 living     52 0.00540
## # … with 3,551 more rows

lifeWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Doing with My Life Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.3 Good at:

good_atWords_F <- females_txt$good_at %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,731 × 3
##    Words     count    prop
##    <chr>     <int>   <dbl>
##  1 im          238 0.0359 
##  2 people      197 0.0297 
##  3 cooking      87 0.0131 
##  4 love         68 0.0103 
##  5 listening    66 0.00996
##  6 dancing      58 0.00875
##  7 laugh        48 0.00724
##  8 friend       46 0.00694
##  9 writing      44 0.00664
## 10 dont         43 0.00649
## # … with 2,721 more rows

good_atWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Good At Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.4 First thing people notice about me:

abt_meWords_F <- females_txt$first_notice_abt_me %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 1,375 × 3
##    Words  count    prop
##    <chr>  <int>   <dbl>
##  1 im       193 0.0563 
##  2 smile    161 0.0470 
##  3 eyes     116 0.0338 
##  4 people    94 0.0274 
##  5 hair      74 0.0216 
##  6 dont      47 0.0137 
##  7 laugh     44 0.0128 
##  8 notice    41 0.0120 
##  9 pretty    38 0.0111 
## 10 lot       30 0.00875
## # … with 1,365 more rows

abt_meWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in First Thing to Notice About Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.5 Favorite Hobbies:

hobbiesWords_F <- females_txt$fav_hobbies %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 8,374 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 love       480 0.0175 
##  2 music      441 0.0161 
##  3 food       420 0.0153 
##  4 movies     360 0.0131 
##  5 books      310 0.0113 
##  6 favorite   243 0.00887
##  7 im         240 0.00876
##  8 read       155 0.00565
##  9 dont       134 0.00489
## 10 time       121 0.00441
## # … with 8,364 more rows

hobbiesWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Hobbies Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.6 Six things:

six_thingsWords_F <- females_txt$six_things %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,190 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 friends    184 0.0339 
##  2 family     158 0.0291 
##  3 music      126 0.0232 
##  4 love        94 0.0173 
##  5 food        82 0.0151 
##  6 water       60 0.0110 
##  7 books       45 0.00828
##  8 coffee      42 0.00773
##  9 phone       41 0.00754
## 10 laughter    39 0.00718
## # … with 2,180 more rows

six_thingsWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in 6 Things Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.7 I think a lot about:

think_abtWords_F <- females_txt$think_abt %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,445 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 life       113 0.0223 
##  2 im          89 0.0175 
##  3 people      72 0.0142 
##  4 time        68 0.0134 
##  5 world       60 0.0118 
##  6 future      59 0.0116 
##  7 love        59 0.0116 
##  8 lot         54 0.0106 
##  9 thinking    54 0.0106 
## 10 friends     28 0.00552
## # … with 2,435 more rows

think_abtWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Think About Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.8 On a typical Friday night:

friday_nightWords_F <- females_txt$friday_night_thing %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 1,631 × 3
##    Words    count   prop
##    <chr>    <int>  <dbl>
##  1 friends    183 0.0407
##  2 home       130 0.0289
##  3 watching    85 0.0189
##  4 movie       78 0.0174
##  5 im          77 0.0171
##  6 dinner      71 0.0158
##  7 friday      69 0.0154
##  8 night       64 0.0143
##  9 dancing     49 0.0109
## 10 hanging     45 0.0100
## # … with 1,621 more rows

friday_nightWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Friday Night Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.9 Most private thing I can reveal:

pvt_thingWords_F <- females_txt$most_pvt_thing %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% 
  anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 1,997 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im        172 0.0475 
##  2 dont       55 0.0152 
##  3 love       50 0.0138 
##  4 private    47 0.0130 
##  5 ive        40 0.0110 
##  6 ill        29 0.00801
##  7 time       26 0.00718
##  8 admit      24 0.00663
##  9 people     24 0.00663
## 10 person     20 0.00552
## # … with 1,987 more rows

pvt_thingWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Most Private Thing Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.1.10 Message me if:

msg_meWords_F <- females_txt$msg_me_if %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,778 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 youre     164 0.0230 
##  2 dont      107 0.0150 
##  3 im        103 0.0144 
##  4 life       84 0.0118 
##  5 fun        71 0.00995
##  6 people     63 0.00883
##  7 love       61 0.00855
##  8 message    59 0.00827
##  9 enjoy      46 0.00645
## 10 person     46 0.00645
## # … with 2,768 more rows

msg_meWords_F %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Message Me If Section (Females)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2 Males

Let’s do some text analysis on the males column

males_txt <- males %>% dplyr::select(age, income, edu_lvl, job, 22:31)

3.2.1 Summary:

summaryWords_M <- males_txt$self_summary %>% 
  str_replace_all("[[:punct:]]", "") %>% str_match_all("\\S+") %>%  
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alnum:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 10,762 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im       1660 0.0361 
##  2 love      787 0.0171 
##  3 life      526 0.0114 
##  4 people    462 0.0100 
##  5 time      389 0.00845
##  6 dont      382 0.00830
##  7 ive       272 0.00591
##  8 friends   266 0.00578
##  9 enjoy     254 0.00552
## 10 fun       218 0.00474
## # … with 10,752 more rows

summaryWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Summary Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.2 Doing with life:

# 2. Doing with life:
lifeWords_M <- males_txt$doing_with_my_life %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 5,751 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im        640 0.0339 
##  2 time      234 0.0124 
##  3 life      220 0.0117 
##  4 love      194 0.0103 
##  5 job       140 0.00742
##  6 friends   138 0.00731
##  7 school    131 0.00694
##  8 ive       117 0.00620
##  9 people    116 0.00615
## 10 day       106 0.00562
## # … with 5,741 more rows

lifeWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Doing with My Life Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.3 Good at:

# 3. Good at:
good_atWords_M <- males_txt$good_at %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 4,725 × 3
##    Words     count    prop
##    <chr>     <int>   <dbl>
##  1 im          388 0.0289 
##  2 people      300 0.0223 
##  3 cooking     139 0.0103 
##  4 listening   130 0.00967
##  5 laugh        99 0.00736
##  6 pretty       99 0.00736
##  7 love         98 0.00729
##  8 playing      84 0.00625
##  9 games        67 0.00498
## 10 music        67 0.00498
## # … with 4,715 more rows

good_atWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Good At Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.4 First thing people notice about me:

# 4. First thing people notice about me:
abt_meWords_M <- males_txt$first_notice_abt_me %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,444 × 3
##    Words  count    prop
##    <chr>  <int>   <dbl>
##  1 im       386 0.0565 
##  2 eyes     193 0.0282 
##  3 people   191 0.0279 
##  4 smile    165 0.0241 
##  5 hair     118 0.0173 
##  6 notice    93 0.0136 
##  7 dont      87 0.0127 
##  8 humor     67 0.00980
##  9 lot       60 0.00878
## 10 sense     58 0.00849
## # … with 2,434 more rows

abt_meWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in First Thing to Notice About Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.5 Favorite Hobbies:

# 5. Favorite Hobbies:
hobbiesWords_M <- males_txt$fav_hobbies %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 12,843 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 music      793 0.0147 
##  2 food       757 0.0141 
##  3 movies     706 0.0131 
##  4 books      604 0.0112 
##  5 love       559 0.0104 
##  6 im         444 0.00826
##  7 favorite   366 0.00680
##  8 dont       257 0.00478
##  9 read       234 0.00435
## 10 tv         233 0.00433
## # … with 12,833 more rows

hobbiesWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Hobbies Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.6 Six things:

# 6. Six things:
six_thingsWords_M <- males_txt$six_things %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 3,611 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 friends    341 0.0325 
##  2 music      258 0.0246 
##  3 family     255 0.0243 
##  4 food       207 0.0198 
##  5 love       142 0.0136 
##  6 water      100 0.00954
##  7 phone       88 0.00840
##  8 internet    80 0.00764
##  9 im          77 0.00735
## 10 books       72 0.00687
## # … with 3,601 more rows

six_thingsWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in 6 Things Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.7 I think a lot about:

# 7. I think a lot about:
think_abtWords_M <- males_txt$think_abt %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 4,106 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 life       229 0.0230 
##  2 im         178 0.0179 
##  3 people     149 0.0150 
##  4 future     125 0.0126 
##  5 time       125 0.0126 
##  6 world      115 0.0115 
##  7 thinking   111 0.0111 
##  8 lot         85 0.00853
##  9 friends     60 0.00602
## 10 spend       60 0.00602
## # … with 4,096 more rows

think_abtWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Think About Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.8 On a typical Friday night:

# 8. On a typical Friday night:
friday_nightWords_M <- males_txt$friday_night_thing %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 2,721 × 3
##    Words    count    prop
##    <chr>    <int>   <dbl>
##  1 friends    378 0.0441 
##  2 home       174 0.0203 
##  3 im         171 0.0200 
##  4 watching   128 0.0149 
##  5 movie      116 0.0135 
##  6 friday     115 0.0134 
##  7 hanging    115 0.0134 
##  8 night      110 0.0128 
##  9 dinner     101 0.0118 
## 10 time        81 0.00945
## # … with 2,711 more rows

friday_nightWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Friday Night Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.9 Most private thing I can reveal:

# 9. Most private thing I can reveal:
pvt_thingWords_M <- males_txt$most_pvt_thing %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>%
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 3,400 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 im        294 0.0403 
##  2 dont       98 0.0134 
##  3 love       98 0.0134 
##  4 people     76 0.0104 
##  5 private    76 0.0104 
##  6 ive        63 0.00863
##  7 time       59 0.00808
##  8 admit      53 0.00726
##  9 ill        53 0.00726
## 10 life       35 0.00479
## # … with 3,390 more rows

pvt_thingWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Most Private Thing Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

3.2.10 Message me if:

msg_meWords_M <- males_txt$msg_me_if %>% str_replace_all("[[:punct:]]", "") %>% 
  str_match_all("\\S+") %>% 
  unlist() %>% as.data.frame() %>% setNames("Words") %>% filter(!str_detect(Words, "\\d")) %>%
  filter(str_detect(Words, "[:alpha:]")) %>%
  group_by(Words) %>% summarize(count = n()) %>% anti_join(stopWords) %>% 
  mutate(prop = count/sum(count)) %>% 
  arrange(desc(count)) %>% print(n = 10)

## # A tibble: 4,056 × 3
##    Words   count    prop
##    <chr>   <int>   <dbl>
##  1 youre     303 0.0251 
##  2 dont      176 0.0146 
##  3 im        174 0.0144 
##  4 fun       143 0.0118 
##  5 love      132 0.0109 
##  6 message   124 0.0103 
##  7 life      116 0.00959
##  8 time       98 0.00811
##  9 meet       89 0.00736
## 10 enjoy      87 0.00720
## # … with 4,046 more rows

msg_meWords_M %>% head(50) %>% mutate(Words = fct_reorder(Words, count)) %>%
  ggplot(aes(x = count, y = Words, fill = Words)) + 
  geom_bar(stat = "identity") +
  theme_cowplot() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Top 50 words used in Message Me If Section (Males)") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

Let’s do a proportional study, to understand what is the affinity of usage of words by each gender in each of the 10 sections.We’ll consider Females as the target group, and see what is their affinity to use a certain word relative to their male counterparts

4 Proportional Analysis of Word Usage

4.1 Females

4.1.1 Summary

inner_join(summaryWords_F, summaryWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

Women like to use the words “dreamer”, “piercing” almost 1350% more than men…

4.1.2 Doing with life:

inner_join(lifeWords_F, lifeWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.3 Good at:

inner_join(good_atWords_F, good_atWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.4 First thing people notice about me:

inner_join(abt_meWords_F, abt_meWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.5 Favorite Hobbies:

inner_join(hobbiesWords_F, hobbiesWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.6 Six things:

inner_join(six_thingsWords_F, six_thingsWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.7 I think a lot about:

inner_join(think_abtWords_F, think_abtWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.8 Most private thing I can reveal:

inner_join(pvt_thingWords_F, pvt_thingWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.1.9 Message me if:

inner_join(msg_meWords_F, msg_meWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_f/prop_m) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2 Males

4.2.1 Summary

inner_join(summaryWords_F, summaryWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.2 Doing with life:

inner_join(lifeWords_F, lifeWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.3 Good at:

inner_join(good_atWords_F, good_atWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.4 First thing people notice about me:

inner_join(abt_meWords_F, abt_meWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.5 Favorite Hobbies:

inner_join(hobbiesWords_F, hobbiesWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.6 Six things:

inner_join(six_thingsWords_F, six_thingsWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.7 I think a lot about:

inner_join(think_abtWords_F, think_abtWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.8 Most private thing I can reveal:

inner_join(pvt_thingWords_F, pvt_thingWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

4.2.9 Message me if:

inner_join(msg_meWords_F, msg_meWords_M, by = "Words") %>% 
  rename(prop_f = prop.x, prop_m = prop.y) %>% mutate(affinityToWords = prop_m/prop_f) %>%
  dplyr::select(Words, prop_f, prop_m, affinityToWords) %>% arrange(desc(affinityToWords))

OkCupid Text Analysis

Shafqat Shafiq

2023-01-15