Question 1. What is the percentage of R questions in 2020?
Question 2. What are the top five programming languages by number of questions between 2015 and 2020 (inclusive)?
Load the data dictionary to understand the data
# Import the data dictionary CSV and print it, so the meaning of each column
# is visible before the analysis starts.
data_dict <- read.csv(
  file = "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data_dictionary.csv"
)
data_dict
## Column
## 1 year
## 2 tag
## 3 num_questions
## 4 year_total
## Description
## 1 The year the question was asked (2008-2020)
## 2 A word or phrase that describes the topic of the question, such as the programming language
## 3 The number of questions with a certain tag in that year
## 4 The total number of questions asked in that year
# Location of the CSV with the per-year, per-tag question counts.
file <- "C:/Users/MUSAAB-TECH/OneDrive/JOHNBULL/data/workspace/workspace/stack_overflow_data.csv"
# Load it into a data frame and preview the first rows.
data <- read.csv(file = file)
head(x = data)
## year tag num_questions year_total
## 1 2008 treeview 69 168541
## 2 2008 scheduled-tasks 30 168541
## 3 2008 specifications 21 168541
## 4 2008 rendering 35 168541
## 5 2008 http-post 6 168541
## 6 2008 static-assert 1 168541
# Show the type of every column together with a sample of its values.
str(data)
## 'data.frame': 420066 obs. of 4 variables:
## $ year : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ tag : chr "treeview" "scheduled-tasks" "specifications" "rendering" ...
## $ num_questions: int 69 30 21 35 6 1 159 10 4 20 ...
## $ year_total : int 168541 168541 168541 168541 168541 168541 168541 168541 168541 168541 ...
# Print per-column summary statistics (range, quartiles, mean).
summary(data)
## year tag num_questions year_total
## Min. :2008 Length:420066 Min. : 1.0 Min. : 168541
## 1st Qu.:2012 Class :character 1st Qu.: 2.0 1st Qu.:4787010
## Median :2015 Mode :character Median : 7.0 Median :5621997
## Mean :2015 Mean : 142.6 Mean :5222995
## 3rd Qu.:2018 3rd Qu.: 29.0 3rd Qu.:6431458
## Max. :2020 Max. :264379.0 Max. :6612772
# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Question 1: share of the 2020 questions that carry the "r" tag.
r_2020 <- data |>
  # keep only the 2020 row(s) for tag "r"
  filter(year == 2020 & tag == "r") |>
  # percentage of that year's total, rounded to two decimals
  mutate(percentage = round(num_questions / year_total * 100, 2)) |>
  # columns to display, in this order
  select(year, tag, num_questions, year_total, percentage)
# Display the result
r_2020
## year tag num_questions year_total percentage
## 1 2020 r 52662 5452545 0.97
# Bar chart for Question 1: a single bar showing R's share of 2020 questions.
ggplot(data = r_2020, mapping = aes(x = tag, y = percentage)) +
  # one light-blue bar at half width; no legend
  geom_col(width = 0.5, fill = "skyblue", show.legend = FALSE) +
  # value printed above the bar; "%.2f%%" keeps the two decimals from round()
  geom_text(
    mapping = aes(label = sprintf("%.2f%%", percentage)),
    size = 3.5,
    vjust = -0.7
  ) +
  labs(
    y = "Percentage of Questions",
    x = "Tag", # the only tag plotted is "r"
    title = "Percentage of R Questions in 2020"
  ) +
  scale_y_continuous(
    # no padding below zero, 15% headroom on top so the label fits
    expand = expansion(mult = c(0, 0.15)),
    # tick labels as two-decimal numbers followed by "%"
    labels = scales::label_number(accuracy = 0.01, suffix = "%")
  ) +
  # clean base theme, then strip the grid and restore plain axis lines
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )
# Question 2: rank tags by their combined question count for 2015-2020.
# Restrict the data to the 2015-2020 window (both ends inclusive).
filtered_data <- data %>%
  filter(between(year, 2015, 2020))
# Total number of questions per tag over the window, largest first.
tag_totals <- filtered_data %>%
  group_by(tag) %>%
  summarise(total_questions = sum(num_questions, na.rm = TRUE)) %>%
  arrange(desc(total_questions))
# Keep exactly five tags. By default slice_max() returns extra rows when
# several tags tie for fifth place; `with_ties = FALSE` caps the result at
# five so code that expects five entries (e.g. a five-colour palette) keeps
# working. `tag_totals` only has the columns `tag` and `total_questions`, so
# no further column selection is needed.
highest_tags <- tag_totals %>%
  slice_max(total_questions, n = 5, with_ties = FALSE)
highest_tags
## # A tibble: 5 × 2
## tag total_questions
## <chr> <int>
## 1 javascript 1373634
## 2 python 1187838
## 3 java 982747
## 4 android 737330
## 5 c# 730045
# Bar chart of the five tags with the most questions in 2015-2020,
# ordered from the largest total to the smallest.
ggplot(
  highest_tags,
  aes(x = reorder(tag, -total_questions), y = total_questions, fill = tag)
) +
  # one fill colour per tag; the legend is hidden because it would only
  # repeat the x-axis labels
  geom_col(show.legend = FALSE) +
  # print each total just above its bar
  geom_text(aes(label = total_questions), vjust = -0.5, size = 3) +
  labs(
    title = "Programming Language with the highest total number of questions 2015-2020",
    x = "Programming Language",
    y = "Total Number of Questions"
  ) +
  # minimal theme for a cleaner look, with all grid lines removed
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Yearly rows (2015-2020) for the five top tags only.
filtered_data <- data %>%
  filter(tag %in% highest_tags$tag, year >= 2015, year <= 2020)
# One line per tag showing how its yearly question count changed.
ggplot(data = filtered_data, mapping = aes(year, num_questions, colour = tag)) +
  geom_line() +
  theme_minimal() +
  labs(
    y = "Number of Questions",
    x = "Year",
    title = "Trend of Top 5 Programming Languages (2015-2020)"
  )
# Yearly question counts for the "python" and "r" tags, 2010-2020 inclusive.
python_r_trends <- data %>%
  filter(
    tag %in% c("python", "r"),
    year >= 2010,
    year <= 2020
  )
# Preview the first rows.
head(x = python_r_trends)
## year tag num_questions year_total
## 1 2010 r 2264 1970729
## 2 2010 python 27029 1970729
## 3 2011 python 42169 3473395
## 4 2011 r 5835 3473395
## 5 2012 python 64254 4787010
## 6 2012 r 12183 4787010
# Line chart comparing the yearly question counts of Python and R.
ggplot(python_r_trends, aes(x = year, y = num_questions, color = tag)) +
  geom_line() +
  # Years are whole numbers, but the default breaks for a 2010-2020 range can
  # land on fractional values such as 2012.5; use explicit integer breaks.
  scale_x_continuous(breaks = seq(2010, 2020, by = 2)) +
  labs(
    title = "Python vs R Popularity Trend (2010-2020)",
    x = "Year",
    y = "Number of Questions"
  ) +
  theme_minimal()
# Further analysis: share of each year's questions held by every tag.
# `total_questions_year` is the sum of `num_questions` over all tags within a
# year; `percentage_of_total` expresses each tag's count relative to that sum.
# NOTE(review): the data also has a `year_total` column - confirm whether it
# equals this per-year sum; if so it could be used directly.
percentage_trends <- data %>%
  group_by(year) %>%
  # na.rm = TRUE mirrors the per-tag totals computed earlier, so one missing
  # count cannot turn a whole year's percentages into NA
  mutate(total_questions_year = sum(num_questions, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(percentage_of_total = (num_questions / total_questions_year) * 100)
# Percentage-of-year trend for the five top tags, 2015-2020.
ggplot(
  data = percentage_trends %>%
    filter(year >= 2015, year <= 2020, tag %in% highest_tags$tag),
  mapping = aes(year, percentage_of_total, colour = tag)
) +
  geom_line() +
  theme_minimal() +
  labs(
    y = "Percentage of Total Questions",
    x = "Year",
    title = "Percentage of Total Questions per Year for Top 5 Languages (2015-2020)"
  )
# Further analysis: percentage of each year's questions for Python and R.
# Plot the percentage trend for the two tags over 2010-2020.
ggplot(
  percentage_trends %>%
    filter((tag == "python" | tag == "r") & year >= 2010 & year <= 2020),
  aes(x = year, y = percentage_of_total, color = tag)
) +
  geom_line() +
  # Years are whole numbers, but the default breaks for a 2010-2020 range can
  # land on fractional values such as 2012.5; use explicit integer breaks.
  scale_x_continuous(breaks = seq(2010, 2020, by = 2)) +
  labs(
    title = "Percentage of Total Questions per Year for Python vs R (2010-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  ) +
  theme_minimal()
library(RColorBrewer) # ColorBrewer palettes, used below via brewer.pal()
library(ggrepel) # geom_text_repel(): text labels that avoid overlapping
# pick five clearly separated hues from the qualitative "Set1" palette
# NOTE(review): brewer.pal.info does not appear to flag "Set1" as colour-blind
# friendly - confirm, and consider "Dark2" or "Set2" if that is a requirement.
lang_pal <- brewer.pal(5, "Set1") # red, blue, green, purple, orange
# Yearly question counts for the five top tags, with every line labelled
# directly at its right-hand end.
ggplot(
  filtered_data,
  aes(x = year, y = num_questions, colour = tag, group = tag)
) +
  # `linewidth` is the line-thickness argument; using `size` for lines has
  # been deprecated since ggplot2 3.4.0 and triggers a warning
  geom_line(linewidth = 1.1) +
  # add labels at the line end (the latest year in the data)
  geom_text_repel(
    data = filtered_data |> dplyr::filter(year == max(year)),
    aes(label = tag),
    hjust = 0, nudge_x = 0.4, # push a little to the right
    direction = "y", segment.color = NA # no line segment, cleaner look
  ) +
  scale_colour_manual(values = lang_pal, name = "Language") +
  # readable title and axis labels instead of the raw column names
  labs(
    title = "Trend of Top 5 Programming Languages (2015-2020)",
    x = "Year",
    y = "Number of Questions"
  ) +
  coord_cartesian(clip = "off") + # let labels stick out to the right
  theme_minimal(base_size = 12) +
  theme(plot.margin = margin(5.5, 40, 5.5, 5.5)) # room for labels
# Same layout for each tag's share of the year's questions (2015-2020).
ggplot(
  percentage_trends |>
    dplyr::filter(
      tag %in% highest_tags$tag,
      dplyr::between(year, 2015, 2020)
    ),
  aes(year, percentage_of_total, colour = tag, group = tag)
) +
  geom_line(linewidth = 1.1) +
  # label each line at the latest year present in `percentage_trends`
  geom_text_repel(
    data = percentage_trends |>
      dplyr::filter(year == max(year), tag %in% highest_tags$tag),
    aes(label = tag),
    hjust = 0, nudge_x = 0.4, segment.color = NA
  ) +
  scale_colour_manual(values = lang_pal, name = "Language (%)") +
  labs(
    title = "Percentage of Total Questions per Year for Top 5 Languages (2015-2020)",
    x = "Year",
    y = "Percentage of Total Questions"
  ) +
  coord_cartesian(clip = "off") +
  theme_minimal(base_size = 12) +
  theme(plot.margin = margin(5.5, 40, 5.5, 5.5))