Trump Programming Analysis

This file is not reproducible without a connection to the Stack Overflow internal databases and the internal sqlstackr packages, but was set up to be otherwise transparent.

library(tidyverse)

# Code for creating this comes directly from
# http://rpubs.com/dgrtwo/county-results
county_results <- read_csv("~/Dropbox/county_election_2016.csv")
county_results

## # A tibble: 3,111 × 10
##      County   State CountyCode TotalVotes Clinton Johnson Stein Trump
##       <chr>   <chr>      <int>      <int>   <dbl>   <dbl> <dbl> <dbl>
## 1   Autauga Alabama       1001      24661   0.240   0.022 0.004 0.734
## 2   Baldwin Alabama       1003      94090   0.196   0.026 0.005 0.774
## 3   Barbour Alabama       1005      10390   0.467   0.009 0.002 0.523
## 4      Bibb Alabama       1007       8748   0.214   0.014 0.002 0.770
## 5    Blount Alabama       1009      25384   0.085   0.013 0.004 0.899
## 6   Bullock Alabama       1011       4701   0.751   0.005 0.002 0.242
## 7    Butler Alabama       1013       8685   0.428   0.007 0.001 0.563
## 8   Calhoun Alabama       1015      47376   0.279   0.024 0.006 0.692
## 9  Chambers Alabama       1017      13778   0.418   0.012 0.003 0.566
## 10 Cherokee Alabama       1019      10503   0.145   0.014 0.002 0.839
## # ... with 3,101 more rows, and 2 more variables: Population <dbl>,
## #   Area <dbl>

# This came from US.zip at this address:
# http://download.geonames.org/export/dump/

# Followed by:
# grep '\tPPL' US.txt | awk -F "\t" -v OFS=',' '{print $1,$11,$12}' > US_counties.csv

geoname_counties <- readr::read_csv("~/Downloads/US/US_counties.csv",
                                    col_names = c("GeonameId", "StateCode", "County")) %>%
  inner_join(acs::fips.state, by = c(StateCode = "STUSAB")) %>%
  filter(!is.na(County)) %>%
  transmute(GeonameId, CountyCode = as.integer(paste0(STATE, County)))

library(sqlstackr)

traffic <- tbl_TrafficLite("QuestionViews025") %>%
  filter(Date >= "2015-11-01", Date <= "2016-10-31", !is.null(GeonameId)) %>%
  semi_join(tbl_TrafficLite("Geonames") %>% filter(CountryCode == "US"), by = "GeonameId")

total_traffic <- collect(count(traffic))$n

city_totals <- traffic %>%
  count(GeonameId) %>%
  inner_join(tbl_TrafficLite("Geonames"), by = "GeonameId") %>%
  select(GeonameId, CountryName, RegionName, CityName, VisitsTotal = n, Longitude, Latitude) %>%
  collect(n = Inf) %>%
  arrange(desc(VisitsTotal))

fraction_by_city_US <- function(traffic, tags = NULL, cities = NULL) {
  qtags <- tbl_TrafficLite("QuestionTags")

  if (!is.null(tags)) {
    if (length(tags) > 1) {
      qtags <- qtags %>%
        filter(Tag %in% tags)
    } else {
      qtags <- qtags %>%
        filter(Tag == tags)
    }
  }
  
  tag_by_city <- traffic %>%
    inner_join(qtags, by = "QuestionId") %>%
    count(GeonameId, Tag) %>%
    collect(n = Inf) %>%
    ungroup() %>%
    rename(VisitsTag = n) %>%
    inner_join(city_totals, by = "GeonameId")
  
  tag_by_city
}

selected_tags <- c("javascript", "java", "php", "c#", "android", "python",
                   "html", "ios", "c++", "objective-c", "ruby",
                   "c", "r", "node.js", "swift", "vba", "go")

traffic_by_city <- fraction_by_city_US(traffic, selected_tags)

traffic_by_city

## # A tibble: 218,136 × 9
##    GeonameId        Tag VisitsTag   CountryName     RegionName
##        <int>      <chr>     <int>         <chr>          <chr>
## 1    4651648 javascript      2285 United States      Tennessee
## 2    4851935        c++       245 United States           Iowa
## 3    4552910 javascript         8 United States       Oklahoma
## 4    5113760          c       553 United States       New York
## 5    5118289       java       158 United States       New York
## 6    4219001       java       864 United States        Georgia
## 7    4440413 javascript       186 United States    Mississippi
## 8    4480560     python       282 United States North Carolina
## 9    5262838       html      2599 United States      Wisconsin
## 10   4929399 javascript      2050 United States  Massachusetts
## # ... with 218,126 more rows, and 4 more variables: CityName <chr>,
## #   VisitsTotal <int>, Longitude <dbl>, Latitude <dbl>

county_totals <- city_totals %>%
  inner_join(geoname_counties, by = "GeonameId") %>%
  group_by(CountyCode) %>%
  summarize(CountyTraffic = sum(VisitsTotal)) %>%
  inner_join(county_results, by = c("CountyCode"))

county_totals

## # A tibble: 3,093 × 11
##    CountyCode CountyTraffic   County   State TotalVotes Clinton Johnson
##         <int>         <int>    <chr>   <chr>      <int>   <dbl>   <dbl>
## 1        1001          7493  Autauga Alabama      24661   0.240   0.022
## 2        1003         32188  Baldwin Alabama      94090   0.196   0.026
## 3        1005          1400  Barbour Alabama      10390   0.467   0.009
## 4        1007          1049     Bibb Alabama       8748   0.214   0.014
## 5        1009          3660   Blount Alabama      25384   0.085   0.013
## 6        1011           145  Bullock Alabama       4701   0.751   0.005
## 7        1013           806   Butler Alabama       8685   0.428   0.007
## 8        1015         20730  Calhoun Alabama      47376   0.279   0.024
## 9        1017         10880 Chambers Alabama      13778   0.418   0.012
## 10       1019           609 Cherokee Alabama      10503   0.145   0.014
## # ... with 3,083 more rows, and 4 more variables: Stein <dbl>,
## #   Trump <dbl>, Population <dbl>, Area <dbl>

traffic_by_county <- traffic_by_city %>%
  inner_join(geoname_counties, by = "GeonameId") %>%
  group_by(Tag, State = RegionName, CountyCode) %>%
  summarize(TagTraffic = sum(VisitsTag)) %>%
  ungroup() %>%
  inner_join(county_totals, by = c("CountyCode", "State")) %>%
  mutate(PercentTag = TagTraffic / CountyTraffic)

traffic_by_county

## # A tibble: 46,434 × 14
##        Tag   State CountyCode TagTraffic CountyTraffic   County TotalVotes
##      <chr>   <chr>      <int>      <int>         <int>    <chr>      <int>
## 1  android Alabama       1001        219          7493  Autauga      24661
## 2  android Alabama       1003       1214         32188  Baldwin      94090
## 3  android Alabama       1005         52          1400  Barbour      10390
## 4  android Alabama       1007         90          1049     Bibb       8748
## 5  android Alabama       1009        294          3660   Blount      25384
## 6  android Alabama       1011         16           145  Bullock       4701
## 7  android Alabama       1013         35           806   Butler       8685
## 8  android Alabama       1015        627         20730  Calhoun      47376
## 9  android Alabama       1017        321         10880 Chambers      13778
## 10 android Alabama       1019         33           609 Cherokee      10503
## # ... with 46,424 more rows, and 7 more variables: Clinton <dbl>,
## #   Johnson <dbl>, Stein <dbl>, Trump <dbl>, Population <dbl>, Area <dbl>,
## #   PercentTag <dbl>

traffic_by_county %>%
  distinct(CountyCode, .keep_all = TRUE) %>%
  mutate(Density = Population / Area) %>%
  arrange(desc(Population)) %>%
  ggplot(aes(Density, Trump)) +
  geom_point() +
  scale_x_log10() +
  scale_y_continuous(labels = percent_format()) +
  xlab("Population density (ppl / square mile)") +
  ylab("% of votes going to Trump") +
  geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
  geom_smooth(method = "lm") +
  ggtitle("Population density vs Trump voters by county")

traffic_by_county %>%
  filter(Tag %in% c("r"), CountyTraffic > 20000) %>%
  arrange(desc(CountyTraffic)) %>%
  ggplot(aes(PercentTag, Trump)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text(aes(label = County), check_overlap = TRUE, vjust = 1, hjust = 1) +
  scale_x_log10(breaks = c(.003, .01, .03, .1), labels = percent_format()) +
  scale_y_continuous(labels = percent_format()) +
  xlab("% of county's Stack Overflow traffic going to R questions") +
  ylab("% of county's vote going to Donald Trump") +
  ggtitle("R usage vs 2016 voting by county")

traffic_by_county %>%
  filter(Tag %in% c("c#", "php", "python", "r"), CountyTraffic > 20000) %>%
  ggplot(aes(PercentTag, Trump)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text(aes(label = County), check_overlap = TRUE, vjust = 1, hjust = 1) +
  facet_wrap(~ Tag, scales = "free_x") +
  scale_x_log10(breaks = c(.01, .03, .1, .3), labels = percent_format()) +
  scale_y_continuous(labels = percent_format()) +
  xlab("% of county's Stack Overflow traffic going to this tag") +
  ylab("% of county's vote going to Donald Trump") +
  ggtitle("Tag usage vs 2016 voting by county")

options(scipen = 5)

traffic_by_county %>%
  distinct(CountyCode, .keep_all = TRUE) %>%
  filter(Population > 1000) %>%
  ggplot(aes(x = Population, y = CountyTraffic / Population)) +
  geom_point() +
  scale_x_log10(labels = comma_format()) +
  scale_y_log10(labels = comma_format()) +
  geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
  geom_smooth(method = "lm") +
  ylab("Stack Overflow visits per person in one year")

traffic_by_county %>%
  distinct(CountyCode, .keep_all = TRUE) %>%
  filter(Population > 100) %>%
  ggplot(aes(x = Population, y = Trump)) +
  geom_point() +
  scale_x_log10(labels = comma_format()) +
  scale_y_continuous(labels = percent_format()) +
  geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
  geom_smooth(method = "lm") +
  ylab("% of votes going to Trump")

library(purrr)
library(broom)

traffic_by_county %>%
  filter(CountyTraffic > 10000) %>%
  nest(-Tag) %>%
  unnest(map(data, ~ tidy(lm(Trump ~ log2(PercentTag), data = .)))) %>%
  filter(term != "(Intercept)") %>%
  arrange(estimate)

## # A tibble: 17 × 6
##            Tag             term      estimate   std.error     statistic
##          <chr>            <chr>         <dbl>       <dbl>         <dbl>
## 1       python log2(PercentTag) -9.110843e-02 0.007169990 -12.706912619
## 2            r log2(PercentTag) -6.093624e-02 0.003792688 -16.066769402
## 3      node.js log2(PercentTag) -5.208387e-02 0.008155999  -6.385958466
## 4         java log2(PercentTag) -4.707053e-02 0.008835629  -5.327354932
## 5         ruby log2(PercentTag) -4.483379e-02 0.005600394  -8.005471832
## 6  objective-c log2(PercentTag) -3.624041e-02 0.005975325  -6.065009954
## 7            c log2(PercentTag) -2.958341e-02 0.005953212  -4.969320001
## 8          ios log2(PercentTag) -2.666991e-02 0.006910798  -3.859164871
## 9           go log2(PercentTag) -2.503059e-02 0.003733304  -6.704674289
## 10       swift log2(PercentTag) -2.247680e-02 0.005272107  -4.263342904
## 11         c++ log2(PercentTag) -1.576413e-02 0.006496722  -2.426474500
## 12     android log2(PercentTag) -7.378442e-05 0.009764012  -0.007556773
## 13  javascript log2(PercentTag)  3.519171e-02 0.017444381   2.017366327
## 14         vba log2(PercentTag)  4.827056e-02 0.006716626   7.186727677
## 15          c# log2(PercentTag)  8.020841e-02 0.007542917  10.633605446
## 16         php log2(PercentTag)  8.675845e-02 0.008530970  10.169821971
## 17        html log2(PercentTag)  1.580460e-01 0.015772951  10.020066574
## # ... with 1 more variables: p.value <dbl>

Bit of bonus; predicting Trump support from all tags at once with a beta regression:

library(betareg)

trump_tag_data <- traffic_by_county %>%
  filter(CountyTraffic > 10000) %>%
  select(Tag, CountyCode, PercentTag, Trump) %>%
  spread(Tag, PercentTag, fill = 0) %>%
  select(-CountyCode)

betareg_coefs <- betareg(Trump ~ ., data = trump_tag_data) %>%
  tidy() %>%
  filter(component == "mean", term != "(Intercept)") %>%
  arrange(estimate)

betareg_coefs

##    component          term    estimate std.error  statistic      p.value
## 1       mean `objective-c` -22.2309041 9.7892146 -2.2709589 2.314947e-02
## 2       mean             r  -7.0717738 1.5555604 -4.5461261 5.464230e-06
## 3       mean       node.js  -3.6502891 5.0315100 -0.7254858 4.681540e-01
## 4       mean          ruby  -3.6091106 3.3059903 -1.0916882 2.749702e-01
## 5       mean    javascript  -1.8767147 1.3422662 -1.3981687 1.620624e-01
## 6       mean       android  -1.5972272 1.4173827 -1.1268850 2.597911e-01
## 7       mean        python  -0.8621655 0.9393548 -0.9178274 3.587093e-01
## 8       mean         swift   1.1140689 6.4383365  0.1730368 8.626225e-01
## 9       mean            go   1.3630681 9.0427812  0.1507355 8.801844e-01
## 10      mean          java   1.8281697 0.8285557  2.2064536 2.735225e-02
## 11      mean         `c++`   3.8069893 0.9761192  3.9001275 9.614205e-05
## 12      mean             c   4.1933760 1.4011132  2.9928887 2.763505e-03
## 13      mean          `c#`   5.1563143 1.1064639  4.6601741 3.159421e-06
## 14      mean          html   8.4362619 2.7073576  3.1160501 1.832912e-03
## 15      mean           php  10.0666766 1.7056251  5.9020454 3.590223e-09
## 16      mean           ios  10.9018731 7.5181622  1.4500716 1.470386e-01
## 17      mean           vba  13.5927564 3.5069128  3.8759893 1.061924e-04

betareg_coefs %>%
  ggplot(aes(estimate, p.value)) +
  geom_point() +
  geom_text(aes(label = term), vjust = 1, hjust = 1) +
  scale_y_log10()

(Note that when other terms are included, Python is not significant, likely because it is highly correlated with R).

Trump Programming Analysis

David Robinson

November 16, 2016