Disclaimer: This page documents the processes and results of the revision of a working paper in preparation for journal submission. All items on this page are preliminary and subject to change, except those are specifically mentioned as finalized. If you have any question, contact Yongsung Lee via yongsung.lee(at)gatch.edu.
install.packages("tidyverse", repos = "http://cloud.r-project.org/", dependencies = TRUE)
#install.packages("readxl", repos = "http://cloud.r-project.org/", dependencies = TRUE)
install.packages("devtools", repos = "http://cloud.r-project.org/", dependencies = TRUE)
devtools::install_github("tidyverse/readxl")
install.packages("SDMTools", repos = "http://cloud.r-project.org/", dependencies = TRUE)
#install.packages("RColorBrewer", repos = "http://cloud.r-project.org/", dependencies = TRUE)
install.packages("foreign", repos = "http://cloud.r-project.org/", dependencies = TRUE)

library(tidyverse)
library(readxl)
library(SDMTools)
#library(RColorBrewer)
library(foreign)

1. Compute the probabilties of choosing alternatives conditional on class membership

1.1. Extract choice model coefficients: three sets, each set for three latent classes
choice00 <- read_excel("data/Model03_LCCMtable.xlsx", sheet = "choice")

choice01 <- choice00 %>%
  select("Model for Choices", X__1, X__4, X__7) %>% 
  filter(!is.na("Model for Choices") == TRUE, !is.na(X__7) == TRUE, X__1 != "Class1") 

choice02 <- tibble(
  var = choice01[["Model for Choices"]][1:18], 
  coef.choice.1 = as.numeric(choice01$X__1[1:18]), 
  coef.choice.2 = as.numeric(choice01$X__4[1:18]), 
  coef.choice.3 = as.numeric(choice01$X__7[1:18])
  )
1.2. Read in the attributes of alternatives for individuals(n = 7274, or 7290 - 16)
# https://stat.ethz.ch/pipermail/r-help/2003-December/043695.html
alt00 <- as.tibble(
  read.spss("data/alt_07082018.sav", 
            to.data.frame = TRUE, 
            use.value.labesl = FALSE)
  )

alt01 <- alt00 %>%
  mutate(
    id = as.character(alt %/% 100), 
    altid = as.character(alt %% 100), 
    Central = ifelse(CA_Regions == 1, 1, 0), 
    MTC = ifelse(CA_Regions == 2, 1, 0), 
    NorCal = ifelse(CA_Regions == 3, 1, 0), 
    SACOG = ifelse(CA_Regions == 4, 1, 0), 
    SANDAG = ifelse(CA_Regions == 5, 1, 0), 
    SCAG = ifelse(CA_Regions == 6, 1, 0) 
  ) %>%
  select(id, altid, 
         pctnhw_fornhw, pctasn_forasn, pcthis_forhis, pctoldmill, # 4 
         zlnincome, home_ratio, rent_ratio, schoolrate, # +4
         lndist, Amenities, LUmix, Density, # +4  
         Central, MTC, NorCal, SACOG, SANDAG, SCAG, # +6 
         CA_Regions, ccity, urban, suburb, rural_in_urban, rural)
1.3. Calculate utilities for each alternative conditional on belonging to one of the three latent classes
alt01$util1 <- exp(data.matrix(as.data.frame(alt01)[, 3:20]) %*% choice02$coef.choice.1)
alt01$util2 <- exp(data.matrix(as.data.frame(alt01)[, 3:20]) %*% choice02$coef.choice.2)
alt01$util3 <- exp(data.matrix(as.data.frame(alt01)[, 3:20]) %*% choice02$coef.choice.3)

alt02 <- alt01 %>%
  group_by(id) %>% 
  summarize(
    util1_sum = as.numeric(sum(util1)), 
    util2_sum = as.numeric(sum(util2)), 
    util3_sum = as.numeric(sum(util3)), # strange behavior  
    n = n()
  ) 

# check observations with alternatives fewer than ten. 
alt02 %>%
  filter(n < 10) %>%
ggplot() +
  geom_histogram(aes(x = n)) # 8 + 7 + 1 = 16, 7290-16 = 7274 

1.4. Compute the probabilties of choosing alternatives conditional on latent class membership
alt03 <- alt01 %>%
  left_join(alt02) %>%
  select(id, altid, util1, util2, util3, 
         util1_sum, util2_sum, util3_sum, # from alt02 
         ccity, urban, suburb, rural_in_urban, rural) %>%
  mutate(
    prob1 = as.numeric(util1/util1_sum*100), 
    prob2 = as.numeric(util2/util2_sum*100), 
    prob3 = as.numeric(util3/util3_sum*100) 
  ) %>%
  select(id, altid, prob1, prob2, prob3, 
         ccity, urban, suburb, rural_in_urban, rural)

2. Compute the probabilities of individuals belonging to latent classes

2.1. Read in individual attributes
# https://readxl.tidyverse.org/
# The below excel file is copied from M drive. No additional changes are made. 
df00 <- read_excel("data/Model03_barcharts.xlsx", sheet = "resp_07102018_3round_data03")

df01 <- df00 %>% 
  select(pid, case_weight, work_school, nhwhite, education, ownkidsunder6, ownkidsunder12, ownkidsunder17, 
         income, own_house, carlevel, Zpro_suburban_18F, Zcar_as_atool_18F, Zpro_env_policies_18F) %>%
  mutate(
    nhwhite_no = ifelse(nhwhite == 1, 0, 1), 
    edu1 = ifelse(education == 1, 1, 0), 
    edu2 = ifelse(education == 2, 1, 0), 
    edu3 = ifelse(education == 3, 1, 0), 
    edu4 = ifelse(education == 4, 1, 0), 
    inc1 = ifelse(income == 1, 1, 0), 
    inc2 = ifelse(income == 2, 1, 0), 
    inc3 = ifelse(income == 3, 1, 0), 
    house_own = own_house, 
    house_not = ifelse(house_own ==1, 0, 1), 
    car0 = ifelse(carlevel == 0, 1, 0), 
    car1 = ifelse(carlevel == 1, 1, 0), 
    car2 = ifelse(carlevel == 2, 1, 0), 
    car3 = ifelse(carlevel == 3, 1, 0), 
    pro_sub = Zpro_suburban_18F, 
    car_tool = Zcar_as_atool_18F, 
    pro_env = Zpro_env_policies_18F
  ) %>%
  select(pid, case_weight, work_school, nhwhite_no, nhwhite, edu1, edu2, edu3, edu4, 
         ownkidsunder6, ownkidsunder12, ownkidsunder17, inc1, inc2, inc3, house_not, house_own,  
         car0, car1, car2, car3,  
         pro_sub, car_tool, pro_env)

# following several lines are for the calcuation of standardized coefficients
temp <- map_dbl(df01, function(x) wt.sd(x, wt=df01$case_weight))
cov_wt_sd <- tibble(
  cov = names(temp)[3:length(temp)], 
  sd = temp[3:length(temp)]
)
#write_csv(cov_wt_sd, "data/cov_wt_sd.csv")
cov_wt_sd2 <- c(1, cov_wt_sd$sd[1:21])
# end. 

df02 <- cbind(
  constant = rep(1, n = 729), 
  df01[c(4:24)]
)
2.2. Read in the coefficients of the membership model
# The below excel file is copied from M drive. 
# Then, two additional sheets are made based on weighted sd values. 

coef00 <- read_excel("data/Model03_LCCMtable.xlsx", sheet = "member")

coef01 <- coef00 %>% 
  filter(!is.na(X__1)== TRUE) %>%
  select(X__1:X__7)

colnames(coef01) <- c("cov", "coef.1", "z.1", "sig.1", "coef.2", "z.2", "sig.2", "coef.3", "z.3", "sig.3")

coef02 <- coef01 %>% 
  select(cov, coef.1, coef.2, coef.3) %>%
  filter(!is.na(coef.1) == TRUE) #, cov != "No") 

coef02$cov[2:22] <- colnames(df01)[3:23]

coef02$coef.1 <- as.numeric(coef02$coef.1) 
coef02$coef.2 <- as.numeric(coef02$coef.2)
coef02$coef.3 <- as.numeric(coef02$coef.3)

coef02$std.coef.1 <- coef02$coef.1*cov_wt_sd2
coef02$std.coef.2 <- coef02$coef.2*cov_wt_sd2
coef02$std.coef.3 <- coef02$coef.3*cov_wt_sd2

coef02$cov <- factor(coef02$cov, levels = c(coef02$cov))

coef03 <- coef02 %>%
  gather(coef.1:std.coef.3, key = coef_type, value = estimate)
2.3. Compute the probabilities of individuals belonging to latent classes
df03 <- df01 %>% 
  select(pid, case_weight, work_school) %>%
  mutate(
    class1 = exp(as.matrix(df02) %*% coef02[[2]]),
    class2 = exp(as.matrix(df02) %*% coef02[[3]]),
    class3 = exp(as.matrix(df02) %*% coef02[[4]])
  ) 

df03$NHtype.work <- factor(df03$work_school, levels = c(1, 2, 3, 4, 5), 
                           labels = c("Central city", "Urban", "Suburban", 
                                      "Exurban", "Rural"))
df04 <- df03 %>% 
  gather(class1:class3, key = class, value = utility) %>%
  group_by(pid) %>% 
  summarise(
    util_sum = sum(utility)
  )

df05 <- df03 %>% 
  left_join(df04) %>% 
  mutate(
    id = seq.int(nrow(.)), 
    prob.class1 = class1/util_sum, # . placeholder 
    prob.class2 = class2/util_sum, 
    prob.class3 = class3/util_sum 
  ) %>% 
  select(pid, id, case_weight, NHtype.work, prob.class1, prob.class2, prob.class3)

3. Compute weighted shares of individuals choosing certain residential NH type for given work/school NH type.

By case weights and membership probabilities

3.1. Join two tables, alternatives (n=7,274) and individuals (n=729)
alt03$id <- as.integer(alt03$id)

alt04 <- alt03 %>%
  left_join(df05, by = "id") %>%                      # bring in membership probabilities 
  mutate(
    pid = as.integer(pid), 
    altid = as.integer(altid), 
    prob.alt.class1 = prob1, 
    prob.alt.class2 = prob2, 
    prob.alt.class3 = prob3, 
    prob.member1 = prob.class1,
    prob.member2 = prob.class2,
    prob.member3 = prob.class3,
    prob.member1.wt = case_weight * prob.member1,     # case/membership weights are computed here 
    exurb = rural_in_urban, 
    NHtype.home = 0, 
    NHtype.home = ifelse(near(ccity, 1), 1, NHtype.home),  
    NHtype.home = ifelse(near(urban, 1), 2, NHtype.home),  
    NHtype.home = ifelse(near(suburb, 1), 3, NHtype.home),  
    NHtype.home = ifelse(near(exurb, 1), 4, NHtype.home),  
    NHtype.home = ifelse(near(rural, 1), 5, NHtype.home) 
  ) %>%
  select(pid, altid, NHtype.work, 
         prob.alt.class1, prob.alt.class2, prob.alt.class3, 
         prob.member1.wt, NHtype.home)

alt04$NHtype.home = factor(alt04$NHtype.home, levels = c(1, 2, 3, 4, 5), 
                           labels = c("Central city", "Urban", "Suburban", 
                                      "Exurban", "Rural"))
3.2. Compute weighted sum of probabilities by work/school and home NH type
# x = work/school NH, y = percent, class = home NH type 
# position = "fill" (the same hieght of 100% for five bars)

alt05 <- alt04 %>%                                     # case/membership weights are considered here
  mutate(
    prob.alt.class1.wt = prob.alt.class1 * prob.member1.wt/100,
    prob.alt.class2.wt = prob.alt.class2 * prob.member1.wt/100,
    prob.alt.class3.wt = prob.alt.class3 * prob.member1.wt/100
  ) %>%
  select(pid, altid, NHtype.work, NHtype.home, 
         prob.alt.class1.wt, prob.alt.class2.wt, prob.alt.class3.wt, 
         prob.member1.wt) %>%                           # wt.prob.member1: case_weight * prob.member1
  group_by(NHtype.work, NHtype.home) %>%
  summarize(
    prob.alt.class1.wt.sum = as.numeric(sum(prob.alt.class1.wt)),
    prob.alt.class2.wt.sum = as.numeric(sum(prob.alt.class2.wt)),
    prob.alt.class3.wt.sum = as.numeric(sum(prob.alt.class3.wt))
  ) %>%
  select(prob.alt.class1.wt.sum, prob.alt.class2.wt.sum, prob.alt.class3.wt.sum, 
         NHtype.work, NHtype.home)

# Compute % of work/school NH types, which were given to inviduals (my assumption)
alt06 <- alt05 %>%                            
  group_by(NHtype.work) %>%
  summarize(
    By.work.alt.prob.class1.wt.sum = as.numeric(sum(prob.alt.class1.wt.sum)), 
    By.work.alt.prob.class2.wt.sum = as.numeric(sum(prob.alt.class2.wt.sum)), 
    By.work.alt.prob.class3.wt.sum = as.numeric(sum(prob.alt.class3.wt.sum))
  ) %>%
  mutate(                                               # these three sets of values are identical in fact (given)
    By.work.share.class1.wt = round(By.work.alt.prob.class1.wt.sum/sum(By.work.alt.prob.class1.wt.sum)*100, 1), 
    By.work.share.class2.wt = round(By.work.alt.prob.class2.wt.sum/sum(By.work.alt.prob.class2.wt.sum)*100, 1), 
    By.work.share.class3.wt = round(By.work.alt.prob.class3.wt.sum/sum(By.work.alt.prob.class3.wt.sum)*100, 1)
  ) %>%
  select(NHtype.work, By.work.share.class1.wt, By.work.share.class2.wt, By.work.share.class3.wt)

classname <- c(
  "1, Younger Pro-Urban", 
  "2, Affluent, Highly Educated", 
  "3, Middle-class Homeowners"
)
3.3. Create three barcharts
alt05 %>% 
  ggplot() + 
  geom_bar(
    aes(x = factor(NHtype.work, labels=c("Central city (9.5%)", "Urban (27.3%)", "Suburban (43.8%)", 
                                         "Exurban (13.1%)", "Rural (6.4%)")),
        y = prob.alt.class1.wt.sum, 
        fill = NHtype.home), 
    stat = "identity", 
    position = "fill"
  ) + 
  scale_fill_brewer(palette = "RdYlBu", direction = -1) +
  labs(y = "Home Neighborhood Type", x = "Work/school Neighborhood Type", fill = "Home NH Type", 
       title = str_c("With the residential preferences of Latent Class", classname[[1]], sep = " "))  

alt05 %>% 
  ggplot() + 
  geom_bar(
    aes(x = factor(NHtype.work, labels=c("Central city (9.5%)", "Urban (27.3%)", "Suburban (43.8%)", 
                                         "Exurban (13.1%)", "Rural (6.4%)")),
        y = prob.alt.class2.wt.sum, 
        fill = NHtype.home), 
    stat = "identity", 
    position = "fill"
  ) + 
  scale_fill_brewer(palette = "RdYlBu", direction = -1) +
  labs(y = "Home Neighborhood Type", x = "Work/school Neighborhood Type", fill = "Home NH Type", 
       title = str_c("With the residential preferences of Latent Class", classname[[2]], sep = " "))

alt05 %>% 
  ggplot() + 
  geom_bar(
    aes(x = factor(NHtype.work, labels=c("Central city (9.5%)", "Urban (27.3%)", "Suburban (43.8%)", 
                                         "Exurban (13.1%)", "Rural (6.4%)")),
        y = prob.alt.class3.wt.sum, 
        fill = NHtype.home), 
    stat = "identity", 
    position = "fill"
  ) + 
  scale_fill_brewer(palette = "RdYlBu", direction = -1) +
  labs(y = "Home Neighborhood Type", x = "Work/school Neighborhood Type", fill = "Home NH Type", 
       title = str_c("With the residential preferences of Latent Class ", classname[[3]], sep = " "))