Overview

This notebook reads a Google Trends CSV export (from a local path or a GitHub raw URL), cleans and reshapes it into a simple long format, saves the cleaned CSV (Beginner), and optionally loads the data into a small SQLite database (Intermediate). The database step can be turned on or off with a single flag.
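The cleaned long format has one row per date, skill, and region. Schematically (placeholder values, not real data):

# date        skill_name  region          interest
# 2025-10-13  python      United States         52
# 2025-10-13  sql         United States         11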

Parameters

library(fs)

# ---- Project root is set in the knit-root chunk ----
# project_root is already defined and set as working dir

# ---- Toggle: where to read from ----
use_remote_raw <- TRUE        # TRUE = download + clean from GitHub raw
                              # FALSE = skip download and use local cleaned CSV

# ---- Remote raw (Joao's CSV on GitHub) ----
csv_urls <- c(
  "https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/Data_Cleaning/data_science_skills_gt.csv",
  "https://raw.githubusercontent.com/JDO-MSDS/Data-607-Project-3/main/data/data_science_skills_gt.csv"
)

# Probe the URLs in order and keep the first one that responds
probe_ok <- FALSE
for (url in csv_urls) {
  try({
    readr::read_lines(url, n_max = 1)   # cheap reachability check
    csv_url  <- url
    probe_ok <- TRUE
    break
  }, silent = TRUE)
}

if (!probe_ok) stop("Could not access either remote CSV path.")

# ---- Outputs (single source of truth) ----
dir_create(file.path(project_root, "data"))
clean_csv_out <- file.path(project_root, "data", "trends_long.csv")
db_path       <- file.path(project_root, "data", "warehouse.db")

# ---- Business parameters ----
skills       <- c("python", "r programming", "sql", "tableau")
time_window  <- "2004-01-01 2025-10-01"
granularity  <- "weekly"
retrieved_on <- Sys.Date()

# ---- DB load on/off ----
write_to_db  <- TRUE

Load Packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DBI)
library(RSQLite)
library(purrr)
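
The chunk that reads and reshapes the raw export into iot_clean is the heart of the Beginner step. Here is a minimal sketch of it, assuming the standard Google Trends multiTimeline.csv layout (two preamble lines, a week column, then one column per term such as "python: (United States)"); the skip count, the single region value, and the recoding of "<1" to 0.5 are assumptions rather than the notebook's exact code, though 0.5 does match the interest range reported by the validation checks below:

if (use_remote_raw) {
  # Skip the two-line preamble of a Google Trends export
  iot_raw <- readr::read_csv(csv_url, skip = 2, show_col_types = FALSE)

  iot_clean <- iot_raw %>%
    rename(date = 1) %>%                          # first column is the week start
    mutate(date = as.Date(date)) %>%
    pivot_longer(-date, names_to = "skill_name", values_to = "interest") %>%
    mutate(
      region     = "United States",               # assumed single region
      skill_name = tolower(str_squish(str_remove(skill_name, ":\\s*\\(.*\\)$"))),
      interest   = as.numeric(str_replace(as.character(interest), "^<\\s*1$", "0.5"))
    )

  readr::write_csv(iot_clean, clean_csv_out)      # Beginner deliverable
} else {
  # Reuse the previously cleaned CSV
  iot_clean <- readr::read_csv(clean_csv_out, show_col_types = FALSE) %>%
    mutate(date = as.Date(date))
}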

Validation Checks

# --- Validation Checks ---
cat("\n=== Validation Checks ===\n")
## 
## === Validation Checks ===
stopifnot(all(c("date","skill_name","region","interest") %in% names(iot_clean)))
stopifnot(inherits(iot_clean$date, "Date"))
stopifnot(!any(is.na(iot_clean$date)))
stopifnot(!any(is.na(iot_clean$skill_name)))
stopifnot(!any(is.na(iot_clean$interest)))
cat("Distinct weeks:", dplyr::n_distinct(iot_clean$date), "\n")
## Distinct weeks: 262
rng <- range(iot_clean$interest, na.rm = TRUE)
cat("Interest range:", rng[1], "to", rng[2], "\n")
## Interest range: 0.5 to 100
dup_n <- iot_clean %>% dplyr::count(date, skill_name, region) %>% dplyr::filter(n > 1) %>% nrow()
cat("Duplicate (date, skill, region) rows:", dup_n, "\n")
## Duplicate (date, skill, region) rows: 0
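
One further check worth adding (optional, a suggested extension): in a balanced long table, every skill should cover the same number of weeks.

# Optional: each skill should appear the same number of times
weeks_per_skill <- iot_clean %>% dplyr::count(skill_name, name = "weeks")
print(weeks_per_skill)
stopifnot(dplyr::n_distinct(weeks_per_skill$weeks) == 1)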

Load into SQLite

# The write_to_db flag from Parameters is meant to gate this chunk
# (e.g., via the knitr chunk option eval = write_to_db)

# Ensure the db directory exists
fs::dir_create(dirname(db_path))

con <- dbConnect(RSQLite::SQLite(), db_path)
if (!DBI::dbIsValid(con)) stop("Failed to open SQLite connection: ", db_path)

# Create the tables if they don't already exist (idempotent)
dbExecute(con, "CREATE TABLE IF NOT EXISTS Skill (
  skill_id   INTEGER PRIMARY KEY,
  skill_name TEXT UNIQUE NOT NULL
);")
## [1] 0
dbExecute(con, "CREATE TABLE IF NOT EXISTS TrendQuery (
  query_id     INTEGER PRIMARY KEY,
  skill_id     INTEGER NOT NULL,
  region       TEXT NOT NULL,
  time_window  TEXT NOT NULL,
  granularity  TEXT NOT NULL,
  retrieved_on DATE NOT NULL,
  FOREIGN KEY(skill_id) REFERENCES Skill(skill_id)
);")
## [1] 0
dbExecute(con, "CREATE TABLE IF NOT EXISTS TrendPoint (
  point_id       INTEGER PRIMARY KEY,
  query_id       INTEGER NOT NULL,
  date           DATE NOT NULL,
  interest_score REAL NOT NULL,
  FOREIGN KEY(query_id) REFERENCES TrendQuery(query_id)
);")
## [1] 0
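# Optional hardening (a suggested addition, assuming one point per
# query_id + date): a unique index makes SQLite itself enforce the same
# deduplication the anti_join below performs in R.
dbExecute(con, "CREATE UNIQUE INDEX IF NOT EXISTS idx_trendpoint_query_date
                ON TrendPoint(query_id, date);")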
# Upsert skills: append only names not already present in Skill
skills_df <- iot_clean %>% distinct(skill_name) %>% arrange(skill_name)
existing_skills <- dbGetQuery(con, "SELECT skill_id, skill_name FROM Skill;")
to_insert <- dplyr::anti_join(skills_df, existing_skills, by = "skill_name")
if (nrow(to_insert) > 0) dbWriteTable(con, "Skill", to_insert, append = TRUE)
skill_dim <- dbGetQuery(con, "SELECT skill_id, skill_name FROM Skill;")

# Build one TrendQuery row per (skill, region) for this run's parameters
query_df <- iot_clean %>%
  distinct(skill_name, region) %>%
  inner_join(skill_dim, by = "skill_name") %>%
  transmute(
    skill_id,
    region,
    time_window  = .env$time_window,     # .env: explicit environment lookup
    granularity  = .env$granularity,
    retrieved_on = as.character(.env$retrieved_on)
  )

existing_query <- dbGetQuery(con, "SELECT skill_id, region, time_window, granularity, retrieved_on FROM TrendQuery;")
existing_query <- existing_query %>% mutate(retrieved_on = as.character(retrieved_on))

query_new <- dplyr::anti_join(query_df, existing_query,
  by = c("skill_id","region","time_window","granularity","retrieved_on")
)
if (nrow(query_new) > 0) dbWriteTable(con, "TrendQuery", query_new, append = TRUE)
query_dim <- dbGetQuery(con, "SELECT query_id, skill_id, region FROM TrendQuery;")

# Insert TrendPoint rows, skipping (query_id, date) pairs already loaded
points_df <- iot_clean %>%
  inner_join(skill_dim, by = "skill_name") %>%
  inner_join(query_dim, by = c("skill_id","region")) %>%
  transmute(query_id, date = as.Date(date), interest_score = interest)

existing_points <- dbGetQuery(con, "SELECT query_id, date FROM TrendPoint;") %>%
  mutate(date = as.Date(date, origin = "1970-01-01"))  # SQLite returns dates as day counts
points_to_insert <- dplyr::anti_join(points_df, existing_points, by = c("query_id","date"))
if (nrow(points_to_insert) > 0) dbWriteTable(con, "TrendPoint", points_to_insert, append = TRUE)

cat("Loaded", nrow(points_to_insert), "TrendPoint rows into", db_path, "\n")
## Loaded 0 TrendPoint rows into /Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_3/Data_607_Project_3/data/warehouse.db
DBI::dbDisconnect(con)  # close the load connection before re-checking

# Quick DB sanity checks on a fresh connection
con <- DBI::dbConnect(RSQLite::SQLite(), db_path)
DBI::dbListTables(con)
## [1] "Skill"      "TrendPoint" "TrendQuery"
cat("\nRow counts by table:\n")
## 
## Row counts by table:
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS skill_rows FROM Skill;"))
##   skill_rows
## 1          3
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS query_rows FROM TrendQuery;"))
##   query_rows
## 1          3
print(DBI::dbGetQuery(con, "SELECT COUNT(*) AS point_rows FROM TrendPoint;"))
##   point_rows
## 1        786
cat("\nSample TrendPoint rows:\n")
## 
## Sample TrendPoint rows:
print(DBI::dbGetQuery(con, "SELECT query_id, date, interest_score FROM TrendPoint ORDER BY date DESC LIMIT 5;"))
##   query_id  date interest_score
## 1        1 20373             52
## 2        2 20373             11
## 3        3 20373              1
## 4        1 20366             57
## 5        2 20366             12
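# Dates print as integer day counts because SQLite has no native DATE type;
# convert with as.Date(x, origin = "1970-01-01") when reading back.
# Illustrative follow-up query: average interest per skill.
print(DBI::dbGetQuery(con, "
  SELECT s.skill_name, ROUND(AVG(p.interest_score), 1) AS avg_interest
  FROM TrendPoint p
  JOIN TrendQuery q ON p.query_id = q.query_id
  JOIN Skill s ON q.skill_id = s.skill_id
  GROUP BY s.skill_name
  ORDER BY avg_interest DESC;
"))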
DBI::dbDisconnect(con)