# Load libraries
library(readr)
## Warning: package 'readr' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the by-tag, by-year question counts from CSV
by_tag_year <- read_csv(file = "datasets/by_tag_year.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## tag = col_character(),
## number = col_double(),
## year_total = col_double()
## )
# Inspect the dataset: a bare name at top level auto-prints the table
# (columns: year, tag, number, year_total)
by_tag_year
## # A tibble: 40,518 x 4
## year tag number year_total
## <dbl> <chr> <dbl> <dbl>
## 1 2008 .htaccess 54 58390
## 2 2008 .net 5910 58390
## 3 2008 .net-2.0 289 58390
## 4 2008 .net-3.5 319 58390
## 5 2008 .net-4.0 6 58390
## 6 2008 .net-assembly 3 58390
## 7 2008 .net-core 1 58390
## 8 2008 2d 42 58390
## 9 2008 32-bit 19 58390
## 10 2008 32bit-64bit 4 58390
## # ... with 40,508 more rows
# Add a fraction column: each row's number divided by its year_total
by_tag_year_fraction <- mutate(by_tag_year, fraction = number / year_total)
# Show the table with the new column
by_tag_year_fraction
## # A tibble: 40,518 x 5
## year tag number year_total fraction
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2008 .htaccess 54 58390 0.000925
## 2 2008 .net 5910 58390 0.101
## 3 2008 .net-2.0 289 58390 0.00495
## 4 2008 .net-3.5 319 58390 0.00546
## 5 2008 .net-4.0 6 58390 0.000103
## 6 2008 .net-assembly 3 58390 0.0000514
## 7 2008 .net-core 1 58390 0.0000171
## 8 2008 2d 42 58390 0.000719
## 9 2008 32-bit 19 58390 0.000325
## 10 2008 32bit-64bit 4 58390 0.0000685
## # ... with 40,508 more rows
# Keep only the rows whose tag is exactly "r"
r_over_time <- filter(by_tag_year_fraction, tag == "r")
# Show the R-only table
r_over_time
## # A tibble: 11 x 5
## year tag number year_total fraction
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2008 r 8 58390 0.000137
## 2 2009 r 524 343868 0.00152
## 3 2010 r 2270 694391 0.00327
## 4 2011 r 5845 1200551 0.00487
## 5 2012 r 12221 1645404 0.00743
## 6 2013 r 22329 2060473 0.0108
## 7 2014 r 31011 2164701 0.0143
## 8 2015 r 40844 2219527 0.0184
## 9 2016 r 44611 2226072 0.0200
## 10 2017 r 54415 2305207 0.0236
## 11 2018 r 28938 1085170 0.0267
# Load ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
# Line plot of the R tag's fraction (y) against year (x)
r_over_time %>%
  ggplot(aes(x = year, y = fraction)) +
  geom_line()

# Tags to compare: r, dplyr, ggplot2 and tidyr
selected_tags <- c("r", "dplyr", "ggplot2", "tidyr")
# Restrict the fraction table to rows whose tag is in that set
selected_tags_over_time <- filter(by_tag_year_fraction, tag %in% selected_tags)
# Line plot of fraction by year, one coloured line per tag
selected_tags_over_time %>%
  ggplot(aes(x = year, y = fraction, color = tag)) +
  geom_line()

# Find total number of questions for each tag across all years.
# sum(number) adds up the per-year counts; n() would only count how many
# rows (years) each tag has, which says nothing about how large the tag is.
sorted_tags <- by_tag_year %>%
  group_by(tag) %>%
  summarize(tag_total = sum(number)) %>%
  arrange(desc(tag_total))
# Print the new table
print(sorted_tags)
# NOTE: the output previously recorded here was produced with n() (every tag
# shown had tag_total = 11, i.e. a row count, not a question count). It no
# longer matches this code and has been dropped; re-run to regenerate it.
# Get the six largest tags (sorted_tags is in descending order of tag_total)
highest_tags <- head(sorted_tags$tag, n = 6)
# Filter for the six largest tags
by_tag_subset <- filter(by_tag_year_fraction, tag %in% highest_tags)
print(by_tag_subset)
# NOTE: the output previously recorded here listed the tags picked by the
# row-count ranking, so it is stale as well; re-run to regenerate it.
# Plot tags over time on a line plot using color to represent tag
ggplot(by_tag_subset, aes(x = year, y = fraction, col = tag)) + geom_line()

# Tags of interest for this comparison
my_tags <- c("android", "ios", "windows-phone")
# Keep only the rows for those tags (this overwrites by_tag_subset)
by_tag_subset <- by_tag_year_fraction %>%
  filter(tag %in% my_tags)
# Line plot of fraction by year, one coloured line per tag
by_tag_subset %>%
  ggplot(aes(x = year, y = fraction, colour = tag)) +
  geom_line()
