This notebook reads a Google Trends CSV export from the website (from a local path or a GitHub raw URL), cleans and reshapes it into a simple long format, saves the cleaned CSV (Beginner), and optionally loads the data into a small SQLite database (Intermediate). The database step can be turned on or off with a single flag.
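For orientation, this is the long format the cleaning step produces: one row per (date, skill, region) with columns date, skill_name, region, interest. The rows below are a minimal sketch with illustrative values only, not real data:
# Illustrative rows only; real values come from the Google Trends export.
trends_long_example <- data.frame(
  date       = as.Date(c("2020-10-11", "2020-10-11", "2020-10-11")),
  skill_name = c("python", "sql", "tableau"),
  region     = c("US", "US", "US"),
  interest   = c(64, 16, 1)
)
trends_long_example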
library(fs)
# ---- Project root is set in the knit-root chunk ----
# project_root is already defined and set as working dir
# ---- Toggle: where to read from ----
use_remote_raw <- TRUE # TRUE = download + clean from GitHub raw
# FALSE = skip download and use local cleaned CSV
# ---- Remote raw (Joao's CSV on GitHub) ----
csv_urls <- c(
"https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/Data_Cleaning/data_science_skills_gt.csv",
"https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/data/data_science_skills_gt.csv"
)
# Try the first URL that responds; tryCatch returns FALSE on any error so the loop can move on
probe_ok <- FALSE
csv_url <- NA_character_
for (url in csv_urls) {
  ok <- tryCatch({
    readr::read_lines(url, n_max = 1)
    TRUE
  }, error = function(e) FALSE)
  if (ok) {
    csv_url <- url
    probe_ok <- TRUE
    break
  }
}
if (!probe_ok) stop("Could not access either remote CSV path.")
# ---- Outputs (single source of truth) ----
dir_create(file.path(project_root, "data"))
clean_csv_out <- file.path(project_root, "data", "trends_long.csv")
db_path <- file.path(project_root, "data", "warehouse.db")
# ---- Business parameters ----
skills <- c("python", "r programming", "sql", "tableau")
time_window <- "2004-01-01 2025-10-01"
granularity <- "weekly"
retrieved_on <- Sys.Date()
# ---- DB load on/off ----
write_to_db <- TRUE
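write_to_db is presumably the single on/off flag mentioned in the introduction. A minimal sketch of how the database step could be guarded, assuming the flag is checked directly in code (the actual notebook may instead gate the chunk with a knitr option such as eval=write_to_db):
# Hypothetical guard around the SQLite load; illustration only.
if (!write_to_db) {
  message("write_to_db = FALSE: skipping the SQLite load step.")
} else {
  message("write_to_db = TRUE: the SQLite load step will run.")
}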
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DBI)
library(RSQLite)
library(purrr)
detect_header_row <- function(path_or_url) {
lines <- readr::read_lines(path_or_url, n_max = 200)
if (length(lines) && startsWith(lines[1], "\ufeff")) {
lines[1] <- sub("^\ufeff", "", lines[1])
}
for (i in seq_along(lines)) {
parts <- strsplit(lines[i], ",", fixed = TRUE)[[1]]
if (length(parts) > 1) {
first <- trimws(parts[1])
if (grepl("^Week$|^Date$", first, ignore.case = TRUE)) return(i - 1L)
if (!is.na(suppressWarnings(lubridate::ymd(first)))) return(i - 1L)
if (!is.na(suppressWarnings(lubridate::mdy(first)))) return(i - 1L)
}
}
stop("Could not find a header with a date or Week/Date column in: ", path_or_url)
}
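A quick way to see what the helper returns, assuming a typical Google Trends export that starts with a category line and a blank line before the header row (synthetic file, for illustration only):
# Synthetic example file; real exports may differ slightly.
tmp <- tempfile(fileext = ".csv")
writeLines(c(
  "Category: All categories",
  "",
  "Week,python: (United States),sql: (United States)",
  "2020-10-11,64,16"
), tmp)
detect_header_row(tmp)  # 2: skip two preamble lines so read_csv starts at the 'Week' header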
read_trends_csv <- function(path_or_url, default_region = "US") {
skip_n <- detect_header_row(path_or_url)
df <- readr::read_csv(path_or_url, skip = skip_n, show_col_types = FALSE)
# Normalizing first column to 'date'
names(df)[1] <- "date"
df <- df %>% mutate(
date = as.character(date),
date = sub("^\ufeff", "", date),
date = sub("\\s+-\\s+.*$", "", date) # keep start of a range like "2020-01-01 - 2020-01-07"
)
parsed <- suppressWarnings(lubridate::ymd(df$date))
parsed2 <- suppressWarnings(lubridate::mdy(df$date))
# coalesce() keeps the Date class; ifelse() would silently drop it and return day numbers
df$date <- dplyr::coalesce(parsed, parsed2)
df <- df %>% filter(!is.na(date))
# only date + columns that mention any of our skills
nm <- names(df)
skill_pattern <- paste0("(", paste(stringr::str_to_lower(skills), collapse = "|"), ")")
lower_nm <- stringr::str_to_lower(nm)
skill_match <- stringr::str_detect(lower_nm, skill_pattern)
keep_cols <- unique(c("date", nm[skill_match]))
df <- dplyr::select(df, dplyr::all_of(keep_cols))
# make all non-date columns character
df <- df %>% mutate(across(-date, as.character))
if (ncol(df) <= 1) {
stop(
"No series columns matched your skills in CSV: ", path_or_url,
"\nAvailable columns:\n", paste(nm, collapse = " | "),
"\nTip: confirm headers include your terms (e.g., 'python', 'r programming', 'sql', 'tableau')."
)
}
# Long format
long <- df %>%
tidyr::pivot_longer(cols = -date, names_to = "raw_col", values_to = "interest") %>%
dplyr::mutate(
interest = ifelse(interest == "<1", "0.5", as.character(interest)),
interest = suppressWarnings(as.numeric(interest)),
term = stringr::str_trim(
stringr::str_to_lower(stringr::str_remove(raw_col, "\\s*\\(.*\\)$"))
),
region = dplyr::case_when(
stringr::str_detect(raw_col, fixed("(United States)")) ~ "US",
stringr::str_detect(raw_col, fixed("(Worldwide)")) ~ "WORLD",
TRUE ~ default_region
)
) %>%
dplyr::filter(!is.na(interest))
canon <- function(x) {
x %>%
stringr::str_replace(stringr::fixed("(search term)"), "") %>%
stringr::str_replace(stringr::fixed("(programming language)"), "") %>%
stringr::str_replace_all("[^A-Za-z0-9[:space:]]", " ") %>%
stringr::str_squish()
}
long %>%
dplyr::mutate(skill_name = canon(term)) %>%
dplyr::select(date, skill_name, region, interest) %>%
dplyr::arrange(skill_name, date)
}
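Running the reader on the synthetic file from the sketch above (it also relies on the global skills vector defined earlier) yields the long format directly:
# Illustration only: uses the synthetic tmp file created above and the global skills vector.
read_trends_csv(tmp)
# Expected shape: one row per (date, skill), e.g.
#   date       skill_name region interest
#   2020-10-11 python     US           64
#   2020-10-11 sql        US           16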
load_all_from_sources <- function(url_vec) {
if (length(url_vec) == 0) stop("No CSV sources provided.")
purrr::map_df(url_vec, read_trends_csv)
}
# --- Clean & load ---
message("csv_urls = ", paste(csv_urls, collapse = " | "))
## csv_urls = https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/Data_Cleaning/data_science_skills_gt.csv | https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/data/data_science_skills_gt.csv
message("clean_csv_out = ", clean_csv_out)
## clean_csv_out = /Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_3/Data_607_Project_3/data/trends_long.csv
iot_clean <- NULL
remote_ok <- FALSE
# Probe remote
probe_ok <- tryCatch({
readr::read_lines(csv_urls[1], n_max = 1)
TRUE
}, error = function(e) FALSE)
if (probe_ok) {
message("Remote reachable. Loading and cleaning...")
iot_clean <- tryCatch({
load_all_from_sources(csv_urls) %>%
dplyr::mutate(
skill_name = dplyr::case_when(
skill_name %in% c("r", "r programming", "r programming language") ~ "r programming",
TRUE ~ skill_name
)
) %>%
dplyr::filter(stringr::str_to_lower(skill_name) %in% stringr::str_to_lower(skills))
}, error = function(e) {
message("Remote load failed: ", conditionMessage(e))
NULL
})
remote_ok <- !is.null(iot_clean)
} else {
message("Remote not reachable (URL, privacy, or firewall).")
}
## Remote not reachable (URL, privacy, or firewall).
# Fallback to local cleaned file
if (!remote_ok) {
if (file.exists(clean_csv_out)) {
message("Using local cleaned file: ", clean_csv_out)
iot_clean <- readr::read_csv(clean_csv_out, show_col_types = FALSE)
} else {
stop(paste0(
"Could not fetch remote AND local cleaned file not found.\n",
"Check the URL and network access; or create: ", clean_csv_out, "\n"
))
}
}
## Using local cleaned file: /Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_3/Data_607_Project_3/data/trends_long.csv
# --- Normalize the date column (it may arrive as Date, numeric days since 1970-01-01, or "YYYY-MM-DD" text) ---
iot_clean <- iot_clean %>%
  dplyr::mutate(
    date = if (is.numeric(date)) as.Date(date, origin = "1970-01-01") else as.Date(date)
  ) %>%
  dplyr::arrange(skill_name, date)
# --- Simple checks now see proper Date type ---
dup <- iot_clean %>% dplyr::count(date, skill_name, region) %>% dplyr::filter(n > 1)
if (nrow(dup) > 0) message("Duplicate (date, skill, region) rows: ", nrow(dup))
iot_clean %>%
dplyr::group_by(skill_name) %>%
dplyr::summarise(
min_date = min(date),
max_date = max(date),
rows = dplyr::n(),
mean_interest = mean(interest, na.rm = TRUE),
.groups = "drop"
) %>% print()
## # A tibble: 3 × 5
## skill_name min_date max_date rows mean_interest
## <chr> <date> <date> <int> <dbl>
## 1 python 2020-10-11 2025-10-12 262 65.2
## 2 sql 2020-10-11 2025-10-12 262 16.5
## 3 tableau 2020-10-11 2025-10-12 262 1.20
# Persist/confirm cleaned CSV
if (probe_ok && remote_ok) {
fs::dir_create(dirname(clean_csv_out))
readr::write_csv(iot_clean, clean_csv_out)
cat("\nWrote clean long-format CSV to:", clean_csv_out, "\n")
} else {
cat("\nUsing existing local cleaned CSV:", clean_csv_out, "\n")
}
##
## Using existing local cleaned CSV: /Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_3/Data_607_Project_3/data/trends_long.csv
# --- Validation Checks ---
cat("\n=== Validation Checks ===\n")
##
## === Validation Checks ===
stopifnot(all(c("date","skill_name","region","interest") %in% names(iot_clean)))
stopifnot(inherits(iot_clean$date, "Date"))
stopifnot(!any(is.na(iot_clean$date)))
stopifnot(!any(is.na(iot_clean$skill_name)))
stopifnot(!any(is.na(iot_clean$interest)))
cat("Distinct weeks:", dplyr::n_distinct(iot_clean$date), "\n")
## Distinct weeks: 262
rng <- range(iot_clean$interest, na.rm = TRUE)
cat("Interest range:", rng[1], "to", rng[2], "\n")
## Interest range: 0.5 to 100
dup_n <- iot_clean %>% dplyr::count(date, skill_name, region) %>% dplyr::filter(n > 1) %>% nrow()
cat("Duplicate (date, skill, region) rows:", dup_n, "\n")
## Duplicate (date, skill, region) rows: 0
# Ensure db directory exists
fs::dir_create(dirname(db_path))
con <- dbConnect(RSQLite::SQLite(), db_path)
if (!DBI::dbIsValid(con)) stop("Failed to open SQLite connection: ", db_path)
# Create the dimension and fact tables if they do not already exist
dbExecute(con, "CREATE TABLE IF NOT EXISTS Skill (
skill_id INTEGER PRIMARY KEY,
skill_name TEXT UNIQUE NOT NULL
);")
## [1] 0
dbExecute(con, "CREATE TABLE IF NOT EXISTS TrendQuery (
query_id INTEGER PRIMARY KEY,
skill_id INTEGER NOT NULL,
region TEXT NOT NULL,
time_window TEXT NOT NULL,
granularity TEXT NOT NULL,
retrieved_on DATE NOT NULL,
FOREIGN KEY(skill_id) REFERENCES Skill(skill_id)
);")
## [1] 0
dbExecute(con, "CREATE TABLE IF NOT EXISTS TrendPoint (
point_id INTEGER PRIMARY KEY,
query_id INTEGER NOT NULL,
date DATE NOT NULL,
interest_score REAL NOT NULL,
FOREIGN KEY(query_id) REFERENCES TrendQuery(query_id)
);")
## [1] 0
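For reference, joining the three tables recovers the same long shape that was loaded. A sketch of such a query (run it after the inserts below have populated the tables):
# Example join across the schema; returns one row per (skill, region, date).
DBI::dbGetQuery(con, "
  SELECT s.skill_name, q.region, p.date, p.interest_score
  FROM TrendPoint p
  JOIN TrendQuery q ON q.query_id = p.query_id
  JOIN Skill s ON s.skill_id = q.skill_id
  ORDER BY s.skill_name, p.date
  LIMIT 5;
")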
skills_df <- iot_clean %>% distinct(skill_name) %>% arrange(skill_name)
existing_skills <- dbGetQuery(con, "SELECT skill_id, skill_name FROM Skill;")
to_insert <- dplyr::anti_join(skills_df, existing_skills, by = "skill_name")
if (nrow(to_insert) > 0) dbWriteTable(con, "Skill", to_insert, append = TRUE)
skill_dim <- dbGetQuery(con, "SELECT skill_id, skill_name FROM Skill;")
# Build TrendQuery rows and insert only those not already present
query_df <- iot_clean %>%
distinct(skill_name, region) %>%
inner_join(skill_dim, by = "skill_name") %>%
transmute(
skill_id,
region,
time_window = time_window,
granularity = granularity,
retrieved_on = as.character(retrieved_on)
)
existing_query <- dbGetQuery(con, "SELECT skill_id, region, time_window, granularity, retrieved_on FROM TrendQuery;")
existing_query <- existing_query %>% mutate(retrieved_on = as.character(retrieved_on))
query_new <- dplyr::anti_join(query_df, existing_query,
by = c("skill_id","region","time_window","granularity","retrieved_on")
)
if (nrow(query_new) > 0) dbWriteTable(con, "TrendQuery", query_new, append = TRUE)
query_dim <- dbGetQuery(con, "SELECT query_id, skill_id, region FROM TrendQuery;")
# Insert TrendPoint rows, skipping any (query_id, date) pairs already present
points_df <- iot_clean %>%
inner_join(skill_dim, by = "skill_name") %>%
inner_join(query_dim, by = c("skill_id","region")) %>%
transmute(query_id, date = as.Date(date), interest_score = interest)
existing_points <- dbGetQuery(con, "SELECT query_id, date FROM TrendPoint;") %>%
mutate(date = as.Date(date))
points_to_insert <- dplyr::anti_join(points_df, existing_points, by = c("query_id","date"))
if (nrow(points_to_insert) > 0) dbWriteTable(con, "TrendPoint", points_to_insert, append = TRUE)
cat("Loaded", nrow(points_to_insert), "TrendPoint rows into", db_path, "\n")
## Loaded 0 TrendPoint rows into /Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_3/Data_607_Project_3/data/warehouse.db
# Quick DB sanity checks (close the load connection if it is still open, then reopen fresh)
try(DBI::dbDisconnect(con), silent = TRUE)
con <- DBI::dbConnect(RSQLite::SQLite(), db_path)
DBI::dbListTables(con)
## [1] "Skill" "TrendPoint" "TrendQuery"
cat("\nRow counts by table:\n")
##
## Row counts by table:
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS skill_rows FROM Skill;"))
## skill_rows
## 1 3
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS query_rows FROM TrendQuery;"))
## query_rows
## 1 3
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS point_rows FROM TrendPoint;"))
## point_rows
## 1 786
cat("\nSample TrendPoint rows:\n")
##
## Sample TrendPoint rows:
print(DBI::dbGetQuery(con, "SELECT query_id, date, interest_score FROM TrendPoint ORDER BY date DESC LIMIT 5;"))
## query_id date interest_score
## 1 1 20373 52
## 2 2 20373 11
## 3 3 20373 1
## 4 1 20366 57
## 5 2 20366 12
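The date column prints as integers (e.g., 20373) because SQLite has no native date type; RSQLite stores Date values as days since 1970-01-01. A small sketch for converting them back after reading:
# Convert the numeric day counts back to Date on the R side.
sample_points <- DBI::dbGetQuery(con, "SELECT query_id, date, interest_score FROM TrendPoint ORDER BY date DESC LIMIT 5;")
sample_points$date <- as.Date(sample_points$date, origin = "1970-01-01")
print(sample_points)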
# Clean disconnect
DBI::dbDisconnect(con)