# Load libraries
library(readr)
## Warning: package 'readr' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the per-year, per-tag table from CSV into a tibble
by_tag_year <- read_csv(file = "datasets/by_tag_year.csv")
## Parsed with column specification:
## cols(
##   year = col_double(),
##   tag = col_character(),
##   number = col_double(),
##   year_total = col_double()
## )
# Inspect the dataset: one row per (year, tag) with the tag's count (`number`)
# and the overall count for that year (`year_total`)
by_tag_year
## # A tibble: 40,518 x 4
##     year tag           number year_total
##    <dbl> <chr>          <dbl>      <dbl>
##  1  2008 .htaccess         54      58390
##  2  2008 .net            5910      58390
##  3  2008 .net-2.0         289      58390
##  4  2008 .net-3.5         319      58390
##  5  2008 .net-4.0           6      58390
##  6  2008 .net-assembly      3      58390
##  7  2008 .net-core          1      58390
##  8  2008 2d                42      58390
##  9  2008 32-bit            19      58390
## 10  2008 32bit-64bit        4      58390
## # ... with 40,508 more rows
# Express each tag's yearly count as a share of that year's total
by_tag_year_fraction <- mutate(by_tag_year, fraction = number / year_total)

# Show the table with the added `fraction` column
by_tag_year_fraction
## # A tibble: 40,518 x 5
##     year tag           number year_total  fraction
##    <dbl> <chr>          <dbl>      <dbl>     <dbl>
##  1  2008 .htaccess         54      58390 0.000925 
##  2  2008 .net            5910      58390 0.101    
##  3  2008 .net-2.0         289      58390 0.00495  
##  4  2008 .net-3.5         319      58390 0.00546  
##  5  2008 .net-4.0           6      58390 0.000103 
##  6  2008 .net-assembly      3      58390 0.0000514
##  7  2008 .net-core          1      58390 0.0000171
##  8  2008 2d                42      58390 0.000719 
##  9  2008 32-bit            19      58390 0.000325 
## 10  2008 32bit-64bit        4      58390 0.0000685
## # ... with 40,508 more rows
# Keep only the rows whose tag is exactly "r"
r_over_time <- filter(by_tag_year_fraction, tag == "r")

# Show the R tag's yearly figures
r_over_time
## # A tibble: 11 x 5
##     year tag   number year_total fraction
##    <dbl> <chr>  <dbl>      <dbl>    <dbl>
##  1  2008 r          8      58390 0.000137
##  2  2009 r        524     343868 0.00152 
##  3  2010 r       2270     694391 0.00327 
##  4  2011 r       5845    1200551 0.00487 
##  5  2012 r      12221    1645404 0.00743 
##  6  2013 r      22329    2060473 0.0108  
##  7  2014 r      31011    2164701 0.0143  
##  8  2015 r      40844    2219527 0.0184  
##  9  2016 r      44611    2226072 0.0200  
## 10  2017 r      54415    2305207 0.0236  
## 11  2018 r      28938    1085170 0.0267
# Load ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
# Line plot of the R tag's fraction by year
ggplot(data = r_over_time, mapping = aes(x = year, y = fraction)) +
  geom_line()

# Tags to compare against each other
selected_tags <- c("r", "dplyr", "ggplot2", "tidyr")

# Restrict the fraction table to the selected tags
selected_tags_over_time <- filter(by_tag_year_fraction, tag %in% selected_tags)

# One line per tag, distinguished by color
ggplot(
  data = selected_tags_over_time,
  mapping = aes(x = year, y = fraction, color = tag)
) +
  geom_line()

# Find total number of questions for each tag, largest first.
# The total is the sum of the yearly `number` column. Using n() here would
# only count how many yearly rows each tag has, so every long-lived tag would
# tie and the "largest" tags would just be the first ones alphabetically.
sorted_tags <- by_tag_year %>%
  group_by(tag) %>%
  summarize(tag_total = sum(number)) %>%
  arrange(desc(tag_total))

# Print the new table
print(sorted_tags)

# Get the six largest tags (sorted_tags is already in descending order)
highest_tags <- head(sorted_tags$tag, n = 6)

# Filter for the six largest tags
by_tag_subset <- filter(by_tag_year_fraction, tag %in% highest_tags)
print(by_tag_subset)

# NOTE(review): console output for the two print() calls above is not
# reproduced here; re-run the script to regenerate it.
# Line plot of fraction by year for the tags in by_tag_subset, one color per tag
ggplot(data = by_tag_subset, mapping = aes(x = year, y = fraction, color = tag)) +
  geom_line()

# Tags of interest for the final comparison
my_tags <- c("android", "ios", "windows-phone")

# Keep only rows for those tags (overwrites the previous by_tag_subset)
by_tag_subset <- by_tag_year_fraction %>%
  filter(tag %in% my_tags)

# One line per tag, distinguished by color
ggplot(
  data = by_tag_subset,
  mapping = aes(x = year, y = fraction, color = tag)
) +
  geom_line()