## Warning: package 'ggplot2' was built under R version 3.2.4

Administrative:

Please indicate

Question 1:

Perform an Exploratory Data Analysis (EDA) on the profiles data set, specifically on the relationship between gender and

all while keeping in mind that in HW-3 you will be fitting a logistic regression to predict a user’s gender based on these variables.

Income

The following will analyze the relationship between gender and income.

# Derive two helper columns in a single mutate():
#   is_female      - numeric 1/0 indicator built from `sex`
#   income_bracket - text label for `income`; the value -1 is labelled
#                    "unreported"
profiles <- profiles %>%
  mutate(
    is_female = ifelse(sex == "f", 1, 0),
    income_bracket = ifelse(
      income == -1, "unreported",
      ifelse(
        income <= 50000, "under_50k",
        ifelse(
          income <= 150000, "50k_to_150k",
          ifelse(income <= 10000000, "150k_to_10mil", "over_10mil")
        )
      )
    )
  )

# Count users per (income_bracket, sex) pair, then convert each count into
# that sex's share of its income bracket.
compare_incomes <- profiles %>%
  group_by(income_bracket, sex) %>%
  summarise(count = n()) %>%
  group_by(income_bracket) %>%
  mutate(prop = count / sum(count)) %>%
  # Drop the leftover income_bracket grouping so that later verbs applied to
  # compare_incomes do not silently operate per bracket.
  ungroup()

# Print the counts and within-bracket proportions as a table.
compare_incomes %>%
  knitr::kable(digits = 4)
income_bracket sex count prop
150k_to_10mil f 133 0.1852
150k_to_10mil m 585 0.8148
50k_to_150k f 979 0.2037
50k_to_150k m 3827 0.7963
under_50k f 2001 0.3346
under_50k m 3979 0.6654
unreported f 21004 0.4336
unreported m 27438 0.5664
# Dodged bars: one bar per sex within each income bracket, height = prop.
ggplot(
  data = compare_incomes,
  mapping = aes(x = income_bracket, y = prop, fill = sex)
) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Self reported incomes of SF OkCupid users")

# income_hist<- profiles %>% 
#   filter(income != -1) %>%
#   ggplot(aes(x = income)) +
#   geom_histogram(bins = 30) +
#   facet_wrap(~sex) +
#   ggtitle("Self reported income by gender in SF OkCupid users")
# income_hist

# Mean income over the rows whose income is not the -1 "unreported" code.
profiles %>%
  filter(income != -1) %>%
  summarise(mean(income))
## Source: local data frame [1 x 1]
## 
##   mean(income)
##          (dbl)
## 1       104395
# Among users with income == -1, compute the share of each sex.
no_reported_income <- summarise(
  filter(profiles, income == -1),
  mean(sex == "f"),
  mean(sex == "m")
)

# Render the two proportions with readable column headers.
knitr::kable(
  no_reported_income,
  digits = 4,
  caption = "Gender breakdown for missing income values",
  col.names = c("proportion female", "proportion male")
)
Gender breakdown for missing income values
proportion female proportion male
0.4336 0.5664
# Overall share of users whose income is the -1 "unreported" code.
with(profiles, mean(income == -1))
## [1] 0.808094
# Per-sex share of users whose income is the -1 "unreported" code.
not_reporting_income <- summarise(
  group_by(profiles, sex),
  "Proportion not reporting income" = mean(income == -1)
)
knitr::kable(not_reporting_income, digits = 2)
sex Proportion not reporting income
f 0.87
m 0.77

80.8% of users did not report an income. 87.09% of women didn’t report and 76.58% of men didn’t report. The “unreported” category is the only category in which females make up more than 40% of the category, even though 40% of users in the overall data are female.

Job

The following will analyze the relationship between gender and job.

# Number of distinct values in the job column.
with(profiles, n_distinct(job))
## [1] 22
# Bar chart of the number of profiles per job value.
ggplot(data = profiles, mapping = aes(x = job)) +
  geom_bar(stat = "count")

# Tally users per job, then relabel each job: jobs with fewer than 600 users
# become "misc", the empty-string job becomes "unreported", and every other
# job keeps its own name.
test <- profiles %>%
  group_by(job) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  mutate(
    job_new = ifelse(
      count < 600,
      "misc",
      ifelse(job == "", "unreported", job)
    )
  )

# Attach job_new to every profile by matching on job; the helper count column
# is dropped again after the join.
profiles <- profiles %>%
  left_join(test, by = "job") %>%
  select(-count)

# Count users per (job_new, sex) pair and turn each count into that sex's
# share of its job category.
jobs_hist <- profiles %>%
  group_by(job_new, sex) %>%
  summarise(count = n()) %>%
  group_by(job_new) %>%
  mutate(prop = count / sum(count)) %>%
  # Drop the leftover job_new grouping so that later verbs applied to
  # jobs_hist do not silently operate per job category.
  ungroup()

# Dodged bars: one bar per sex within each job category, height = prop.
ggplot(jobs_hist, aes(x = job_new, y = prop, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Jobs of SF OkCupid users") +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90))

# Mean of the is_female indicator within each job_new category, as a table.
knitr::kable(
  summarise(
    group_by(profiles, job_new),
    percent_female = mean(is_female)
  ),
  digits = 4
)
job_new percent_female
artistic / musical / writer 0.4240
banking / financial / real estate 0.3486
clerical / administrative 0.7106
computer / hardware / software 0.1361
construction / craftsmanship 0.0793
education / academia 0.6134
entertainment / media 0.3027
executive / management 0.3182
hospitality / travel 0.4179
law / legal services 0.4634
medicine / health 0.5943
misc 0.3074
other 0.4802
political / government 0.4379
sales / marketing / biz dev 0.4363
science / tech / engineering 0.2096
student 0.4406
unreported 0.4460
# Per-sex share of users whose job is the empty string; the variable holds
# the rendered kable table, which is then printed.
not_reporting_job <- knitr::kable(
  summarise(
    group_by(profiles, sex),
    "Proportion not reporting job" = mean(job == "")
  ),
  digits = 2
)
not_reporting_job
sex Proportion not reporting job
f 0.15
m 0.13
# Overall not reporting job: empty-string jobs over non-NA jobs.
with(profiles, sum(job == "") / sum(!is.na(job)))
## [1] 0.1367564
# Per-sex share of users whose job is "other"; the variable holds the
# rendered kable table, which is then printed.
job_reporting_other <- knitr::kable(
  summarise(
    group_by(profiles, sex),
    "Percent reporting job as other" = mean(job == "other")
  ),
  digits = 2
)
job_reporting_other
sex Percent reporting job as other
f 0.15
m 0.11
# Overall share reporting "other": count of "other" jobs over non-NA jobs.
with(profiles, sum(job == "other") / sum(!is.na(job)))
## [1] 0.1265973
# Per-sex share of users whose job is "computer / hardware / software".
computer_dudes <- summarise(
  group_by(profiles, sex),
  "percent in computers" = mean(job == "computer / hardware / software")
)

knitr::kable(computer_dudes, digits = 4)
sex percent in computers
f 0.0266
m 0.1135
# Per-sex share of users whose job is "science / tech / engineering".
tech_dudes <- summarise(
  group_by(profiles, sex),
  "percent in science/tech/engineering" =
    mean(job == "science / tech / engineering")
)

knitr::kable(tech_dudes, digits = 4)
sex percent in science/tech/engineering
f 0.0421
m 0.1070

Overall, 13.7% of users did not report a job. 15.16% of females and 12.68% of males didn’t report. 12.6% of users fall under the category other; 15% of females and 11% of males. Also, 11.3% of males are in computers/hardware/software, while only 2.6% of women are. The numbers are similar for the science/tech/engineering category (10.7% of men versus 4.2% of women). This means that about 22% of men are in one of these 2 categories but only about 7% of women are.

Also, I lumped all jobs that constituted less than 1% of users (count < 600) into one category “misc.”

Orientation

# Number of distinct values in the orientation column.
with(profiles, n_distinct(orientation))
## [1] 3
# Count users per (orientation, sex) pair and turn each count into that sex's
# share of its orientation group.
sexual_orientation_bar <- profiles %>%
  group_by(orientation, sex) %>%
  summarise(count = n()) %>%
  group_by(orientation) %>%
  mutate(prop = count / sum(count)) %>%
  # Drop the leftover orientation grouping so that later verbs applied to
  # sexual_orientation_bar do not silently operate per orientation.
  ungroup()

# Dodged bars: one bar per sex within each orientation, height = prop.
ggplot(sexual_orientation_bar, aes(x = orientation, y = prop, fill = sex)) +
  geom_bar(position = "dodge", stat = "identity") +
  ggtitle("Sexual Orientation in SF OKCupid Users")

# Share of each sex within every orientation; the variable holds the rendered
# kable table, which is then printed.
orientation <- knitr::kable(
  summarise(
    group_by(profiles, orientation),
    "female" = mean(sex == "f"),
    "male" = mean(sex == "m")
  ),
  digits = 4
)
orientation
orientation female male
bisexual 0.7214 0.2786
gay 0.2849 0.7151
straight 0.3979 0.6021

Although only 40% of users are female, 72% of users who are bisexual are women.

Question 2:

In the file HW-2_Shiny_App.Rmd, build the Shiny App discussed in Lec09 on Monday 10/3: Using the movies data set in the ggplot2movies data set, make a Shiny app that