This file is not reproducible without a connection to the Stack Overflow internal databases and the internal sqlstackr packages, but was set up to be otherwise transparent.
library(tidyverse)
# Code for creating this comes directly from
# http://rpubs.com/dgrtwo/county-results
county_results <- read_csv("~/Dropbox/county_election_2016.csv")
county_results
## # A tibble: 3,111 × 10
## County State CountyCode TotalVotes Clinton Johnson Stein Trump
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Autauga Alabama 1001 24661 0.240 0.022 0.004 0.734
## 2 Baldwin Alabama 1003 94090 0.196 0.026 0.005 0.774
## 3 Barbour Alabama 1005 10390 0.467 0.009 0.002 0.523
## 4 Bibb Alabama 1007 8748 0.214 0.014 0.002 0.770
## 5 Blount Alabama 1009 25384 0.085 0.013 0.004 0.899
## 6 Bullock Alabama 1011 4701 0.751 0.005 0.002 0.242
## 7 Butler Alabama 1013 8685 0.428 0.007 0.001 0.563
## 8 Calhoun Alabama 1015 47376 0.279 0.024 0.006 0.692
## 9 Chambers Alabama 1017 13778 0.418 0.012 0.003 0.566
## 10 Cherokee Alabama 1019 10503 0.145 0.014 0.002 0.839
## # ... with 3,101 more rows, and 2 more variables: Population <dbl>,
## # Area <dbl>
# This came from US.zip at this address:
# http://download.geonames.org/export/dump/
# Followed by:
# grep '\tPPL' US.txt | awk -F "\t" -v OFS=',' '{print $1,$11,$12}' > US_counties.csv
geoname_counties <- readr::read_csv("~/Downloads/US/US_counties.csv",
col_names = c("GeonameId", "StateCode", "County")) %>%
inner_join(acs::fips.state, by = c(StateCode = "STUSAB")) %>%
filter(!is.na(County)) %>%
transmute(GeonameId, CountyCode = as.integer(paste0(STATE, County)))
library(sqlstackr)
traffic <- tbl_TrafficLite("QuestionViews025") %>%
filter(Date >= "2015-11-01", Date <= "2016-10-31", !is.null(GeonameId)) %>%
semi_join(tbl_TrafficLite("Geonames") %>% filter(CountryCode == "US"), by = "GeonameId")
total_traffic <- collect(count(traffic))$n
city_totals <- traffic %>%
count(GeonameId) %>%
inner_join(tbl_TrafficLite("Geonames"), by = "GeonameId") %>%
select(GeonameId, CountryName, RegionName, CityName, VisitsTotal = n, Longitude, Latitude) %>%
collect(n = Inf) %>%
arrange(desc(VisitsTotal))
fraction_by_city_US <- function(traffic, tags = NULL, cities = NULL) {
qtags <- tbl_TrafficLite("QuestionTags")
if (!is.null(tags)) {
if (length(tags) > 1) {
qtags <- qtags %>%
filter(Tag %in% tags)
} else {
qtags <- qtags %>%
filter(Tag == tags)
}
}
tag_by_city <- traffic %>%
inner_join(qtags, by = "QuestionId") %>%
count(GeonameId, Tag) %>%
collect(n = Inf) %>%
ungroup() %>%
rename(VisitsTag = n) %>%
inner_join(city_totals, by = "GeonameId")
tag_by_city
}
selected_tags <- c("javascript", "java", "php", "c#", "android", "python",
"html", "ios", "c++", "objective-c", "ruby",
"c", "r", "node.js", "swift", "vba", "go")
traffic_by_city <- fraction_by_city_US(traffic, selected_tags)
traffic_by_city
## # A tibble: 218,136 × 9
## GeonameId Tag VisitsTag CountryName RegionName
## <int> <chr> <int> <chr> <chr>
## 1 4651648 javascript 2285 United States Tennessee
## 2 4851935 c++ 245 United States Iowa
## 3 4552910 javascript 8 United States Oklahoma
## 4 5113760 c 553 United States New York
## 5 5118289 java 158 United States New York
## 6 4219001 java 864 United States Georgia
## 7 4440413 javascript 186 United States Mississippi
## 8 4480560 python 282 United States North Carolina
## 9 5262838 html 2599 United States Wisconsin
## 10 4929399 javascript 2050 United States Massachusetts
## # ... with 218,126 more rows, and 4 more variables: CityName <chr>,
## # VisitsTotal <int>, Longitude <dbl>, Latitude <dbl>
county_totals <- city_totals %>%
inner_join(geoname_counties, by = "GeonameId") %>%
group_by(CountyCode) %>%
summarize(CountyTraffic = sum(VisitsTotal)) %>%
inner_join(county_results, by = c("CountyCode"))
county_totals
## # A tibble: 3,093 × 11
## CountyCode CountyTraffic County State TotalVotes Clinton Johnson
## <int> <int> <chr> <chr> <int> <dbl> <dbl>
## 1 1001 7493 Autauga Alabama 24661 0.240 0.022
## 2 1003 32188 Baldwin Alabama 94090 0.196 0.026
## 3 1005 1400 Barbour Alabama 10390 0.467 0.009
## 4 1007 1049 Bibb Alabama 8748 0.214 0.014
## 5 1009 3660 Blount Alabama 25384 0.085 0.013
## 6 1011 145 Bullock Alabama 4701 0.751 0.005
## 7 1013 806 Butler Alabama 8685 0.428 0.007
## 8 1015 20730 Calhoun Alabama 47376 0.279 0.024
## 9 1017 10880 Chambers Alabama 13778 0.418 0.012
## 10 1019 609 Cherokee Alabama 10503 0.145 0.014
## # ... with 3,083 more rows, and 4 more variables: Stein <dbl>,
## # Trump <dbl>, Population <dbl>, Area <dbl>
traffic_by_county <- traffic_by_city %>%
inner_join(geoname_counties, by = "GeonameId") %>%
group_by(Tag, State = RegionName, CountyCode) %>%
summarize(TagTraffic = sum(VisitsTag)) %>%
ungroup() %>%
inner_join(county_totals, by = c("CountyCode", "State")) %>%
mutate(PercentTag = TagTraffic / CountyTraffic)
traffic_by_county
## # A tibble: 46,434 × 14
## Tag State CountyCode TagTraffic CountyTraffic County TotalVotes
## <chr> <chr> <int> <int> <int> <chr> <int>
## 1 android Alabama 1001 219 7493 Autauga 24661
## 2 android Alabama 1003 1214 32188 Baldwin 94090
## 3 android Alabama 1005 52 1400 Barbour 10390
## 4 android Alabama 1007 90 1049 Bibb 8748
## 5 android Alabama 1009 294 3660 Blount 25384
## 6 android Alabama 1011 16 145 Bullock 4701
## 7 android Alabama 1013 35 806 Butler 8685
## 8 android Alabama 1015 627 20730 Calhoun 47376
## 9 android Alabama 1017 321 10880 Chambers 13778
## 10 android Alabama 1019 33 609 Cherokee 10503
## # ... with 46,424 more rows, and 7 more variables: Clinton <dbl>,
## # Johnson <dbl>, Stein <dbl>, Trump <dbl>, Population <dbl>, Area <dbl>,
## # PercentTag <dbl>
traffic_by_county %>%
distinct(CountyCode, .keep_all = TRUE) %>%
mutate(Density = Population / Area) %>%
arrange(desc(Population)) %>%
ggplot(aes(Density, Trump)) +
geom_point() +
scale_x_log10() +
scale_y_continuous(labels = percent_format()) +
xlab("Population density (ppl / square mile)") +
ylab("% of votes going to Trump") +
geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
geom_smooth(method = "lm") +
ggtitle("Population density vs Trump voters by county")
traffic_by_county %>%
filter(Tag %in% c("r"), CountyTraffic > 20000) %>%
arrange(desc(CountyTraffic)) %>%
ggplot(aes(PercentTag, Trump)) +
geom_point() +
geom_smooth(method = "lm") +
geom_text(aes(label = County), check_overlap = TRUE, vjust = 1, hjust = 1) +
scale_x_log10(breaks = c(.003, .01, .03, .1), labels = percent_format()) +
scale_y_continuous(labels = percent_format()) +
xlab("% of county's Stack Overflow traffic going to R questions") +
ylab("% of county's vote going to Donald Trump") +
ggtitle("R usage vs 2016 voting by county")
traffic_by_county %>%
filter(Tag %in% c("c#", "php", "python", "r"), CountyTraffic > 20000) %>%
ggplot(aes(PercentTag, Trump)) +
geom_point() +
geom_smooth(method = "lm") +
geom_text(aes(label = County), check_overlap = TRUE, vjust = 1, hjust = 1) +
facet_wrap(~ Tag, scales = "free_x") +
scale_x_log10(breaks = c(.01, .03, .1, .3), labels = percent_format()) +
scale_y_continuous(labels = percent_format()) +
xlab("% of county's Stack Overflow traffic going to this tag") +
ylab("% of county's vote going to Donald Trump") +
ggtitle("Tag usage vs 2016 voting by county")
options(scipen = 5)
traffic_by_county %>%
distinct(CountyCode, .keep_all = TRUE) %>%
filter(Population > 1000) %>%
ggplot(aes(x = Population, y = CountyTraffic / Population)) +
geom_point() +
scale_x_log10(labels = comma_format()) +
scale_y_log10(labels = comma_format()) +
geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
geom_smooth(method = "lm") +
ylab("Stack Overflow visits per person in one year")
traffic_by_county %>%
distinct(CountyCode, .keep_all = TRUE) %>%
filter(Population > 100) %>%
ggplot(aes(x = Population, y = Trump)) +
geom_point() +
scale_x_log10(labels = comma_format()) +
scale_y_continuous(labels = percent_format()) +
geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
geom_smooth(method = "lm") +
ylab("% of votes going to Trump")
library(purrr)
library(broom)
traffic_by_county %>%
filter(CountyTraffic > 10000) %>%
nest(-Tag) %>%
unnest(map(data, ~ tidy(lm(Trump ~ log2(PercentTag), data = .)))) %>%
filter(term != "(Intercept)") %>%
arrange(estimate)
## # A tibble: 17 × 6
## Tag term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 python log2(PercentTag) -9.110843e-02 0.007169990 -12.706912619
## 2 r log2(PercentTag) -6.093624e-02 0.003792688 -16.066769402
## 3 node.js log2(PercentTag) -5.208387e-02 0.008155999 -6.385958466
## 4 java log2(PercentTag) -4.707053e-02 0.008835629 -5.327354932
## 5 ruby log2(PercentTag) -4.483379e-02 0.005600394 -8.005471832
## 6 objective-c log2(PercentTag) -3.624041e-02 0.005975325 -6.065009954
## 7 c log2(PercentTag) -2.958341e-02 0.005953212 -4.969320001
## 8 ios log2(PercentTag) -2.666991e-02 0.006910798 -3.859164871
## 9 go log2(PercentTag) -2.503059e-02 0.003733304 -6.704674289
## 10 swift log2(PercentTag) -2.247680e-02 0.005272107 -4.263342904
## 11 c++ log2(PercentTag) -1.576413e-02 0.006496722 -2.426474500
## 12 android log2(PercentTag) -7.378442e-05 0.009764012 -0.007556773
## 13 javascript log2(PercentTag) 3.519171e-02 0.017444381 2.017366327
## 14 vba log2(PercentTag) 4.827056e-02 0.006716626 7.186727677
## 15 c# log2(PercentTag) 8.020841e-02 0.007542917 10.633605446
## 16 php log2(PercentTag) 8.675845e-02 0.008530970 10.169821971
## 17 html log2(PercentTag) 1.580460e-01 0.015772951 10.020066574
## # ... with 1 more variables: p.value <dbl>
Bit of bonus; predicting Trump support from all tags at once with a beta regression:
library(betareg)
trump_tag_data <- traffic_by_county %>%
filter(CountyTraffic > 10000) %>%
select(Tag, CountyCode, PercentTag, Trump) %>%
spread(Tag, PercentTag, fill = 0) %>%
select(-CountyCode)
betareg_coefs <- betareg(Trump ~ ., data = trump_tag_data) %>%
tidy() %>%
filter(component == "mean", term != "(Intercept)") %>%
arrange(estimate)
betareg_coefs
## component term estimate std.error statistic p.value
## 1 mean `objective-c` -22.2309041 9.7892146 -2.2709589 2.314947e-02
## 2 mean r -7.0717738 1.5555604 -4.5461261 5.464230e-06
## 3 mean node.js -3.6502891 5.0315100 -0.7254858 4.681540e-01
## 4 mean ruby -3.6091106 3.3059903 -1.0916882 2.749702e-01
## 5 mean javascript -1.8767147 1.3422662 -1.3981687 1.620624e-01
## 6 mean android -1.5972272 1.4173827 -1.1268850 2.597911e-01
## 7 mean python -0.8621655 0.9393548 -0.9178274 3.587093e-01
## 8 mean swift 1.1140689 6.4383365 0.1730368 8.626225e-01
## 9 mean go 1.3630681 9.0427812 0.1507355 8.801844e-01
## 10 mean java 1.8281697 0.8285557 2.2064536 2.735225e-02
## 11 mean `c++` 3.8069893 0.9761192 3.9001275 9.614205e-05
## 12 mean c 4.1933760 1.4011132 2.9928887 2.763505e-03
## 13 mean `c#` 5.1563143 1.1064639 4.6601741 3.159421e-06
## 14 mean html 8.4362619 2.7073576 3.1160501 1.832912e-03
## 15 mean php 10.0666766 1.7056251 5.9020454 3.590223e-09
## 16 mean ios 10.9018731 7.5181622 1.4500716 1.470386e-01
## 17 mean vba 13.5927564 3.5069128 3.8759893 1.061924e-04
betareg_coefs %>%
ggplot(aes(estimate, p.value)) +
geom_point() +
geom_text(aes(label = term), vjust = 1, hjust = 1) +
scale_y_log10()
(Note that when other terms are included, Python is not significant, likely because it is highly correlated with R).