Step 1: Load libraries
library(httr)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
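The masking messages above are normal: dplyr deliberately overrides filter(), lag(), and a few base set operations. If that noise is unwanted in the knitted report, the packages can be attached quietly; a minimal sketch:
# Optional: attach quietly to keep startup messages out of the report
suppressPackageStartupMessages(library(dplyr))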
Step 2: Set the New York Times API key
source("config.R") # The key lives in config.R, which defines NYT_API_KEY; substitute your own key there
api_key <- NYT_API_KEY
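If you would rather not keep a config.R at all, an environment variable works too. A minimal sketch, assuming you have added an NYT_API_KEY entry to ~/.Renviron or exported it in your shell:
# Alternative: read the key from an environment variable
api_key <- Sys.getenv("NYT_API_KEY")
if (api_key == "") stop("NYT_API_KEY is not set") # Sys.getenv() returns "" when unset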
Step 3: Make the API request
# Request the Technology top stories feed
response <- GET(
  "https://api.nytimes.com/svc/topstories/v2/technology.json",
  query = list("api-key" = api_key)
)
cat("Status:", response$status_code, "\n")
## Status: 200
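Printing the status code is fine for a demo, but a script should abort on a non-2xx response before trying to parse the body. httr ships a helper for exactly this; one line is enough:
# Raise an informative R error on any 4xx/5xx response
stop_for_status(response)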
Step 4: Parse the JSON into a data frame
data <- fromJSON(content(response, "text", encoding = "UTF-8")) # explicit encoding avoids the "No encoding supplied" warning
cat("Top level names:", names(data), "\n")
## Top level names: status copyright section last_updated num_results results
cat("Has results:", !is.null(data$results), "\n")
## Has results: TRUE
if (!is.null(data$results)) {
  articles <- data$results
  cat("Results type:", class(articles), "\n")
  cat("Results dimensions:", dim(articles), "\n")
  cat("Column names:", names(articles), "\n")
  if (nrow(articles) > 0) {
    cat("\nFirst article title:", articles$title[1], "\n")
    cat("First article section:", articles$section[1], "\n")
    cat("First article date:", articles$published_date[1], "\n")
  }
}
## Results type: data.frame
## Results dimensions: 29 19
## Column names: section subsection title abstract url uri byline item_type updated_date created_date published_date material_type_facet kicker des_facet org_facet per_facet geo_facet multimedia short_url
##
## First article title: Meta Layoffs Included Employees Who Monitored Risks to User Privacy
## First article section: technology
## First article date: 2025-10-23T19:03:46-04:00
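Note that jsonlite returns nested JSON objects as nested data frames and JSON arrays as list columns (des_facet, multimedia, and friends above). If you want the nested data frames unnested into ordinary top-level columns, fromJSON() accepts a flatten argument; a minimal sketch:
# flatten = TRUE unnests nested data frames into top-level columns;
# true list columns such as des_facet stay lists either way
data_flat <- fromJSON(content(response, "text", encoding = "UTF-8"), flatten = TRUE)
articles_flat <- data_flat$results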
Step 5: Build a data frame with the main columns
# Keep only the main columns; parse published_date into Date
df <- data.frame(
  title = articles$title,
  section = articles$section,
  abstract = articles$abstract,
  url = articles$url,
  byline = articles$byline,
  published_date = as.Date(articles$published_date),
  item_type = articles$item_type,
  stringsAsFactors = FALSE
)
cat("✅ Created simplified DataFrame with", ncol(df), "columns\n")
## ✅ Created simplified DataFrame with 7 columns
cat("📊 Dimensions:", nrow(df), "rows ×", ncol(df), "columns\n\n")
## 📊 Dimensions: 29 rows × 7 columns
# Show structure
str(df)
## 'data.frame': 29 obs. of 7 variables:
## $ title : chr "Meta Layoffs Included Employees Who Monitored Risks to User Privacy" "Trump Pardons Founder of the Crypto Exchange Binance" "Google’s Quantum Computer Makes a Big Technical Leap" "Reddit Accuses ‘Data Scraper’ Companies of Stealing Its Information" ...
## $ section : chr "technology" "technology" "technology" "technology" ...
## $ abstract : chr "While the company announced job cuts in artificial intelligence, it also expanded plans to replace privacy and "| __truncated__ "Changpeng Zhao, the richest man in crypto, had admitted to money-laundering violations that allowed terrorists "| __truncated__ "Designed to accelerate advances in medicine and other fields, the tech giant’s quantum algorithm runs 13,000 ti"| __truncated__ "In a lawsuit, Reddit pulled back the curtain on an ecosystem of start-ups that scrape Google’s search results a"| __truncated__ ...
## $ url : chr "https://www.nytimes.com/2025/10/23/technology/meta-layoffs-user-privacy.html" "https://www.nytimes.com/2025/10/23/technology/trump-pardons-cz-binance.html" "https://www.nytimes.com/2025/10/22/technology/googles-quantum-computer-leap.html" "https://www.nytimes.com/2025/10/22/technology/reddit-data-scrapers-perplexity-theft.html" ...
## $ byline : chr "By Mike Isaac and Eli Tan" "By David Yaffe-Bellany and Kenneth P. Vogel" "By Cade Metz" "By Mike Isaac" ...
## $ published_date: Date, format: "2025-10-23" "2025-10-23" ...
## $ item_type : chr "Article" "Article" "Article" "Article" ...
# Save
write.csv(df, "nyt_technology_data.csv", row.names = FALSE)
cat("💾 Saved to: nyt_technology_data.csv\n")
## 💾 Saved to: nyt_technology_data.csv
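CSV is handy for sharing, but it drops column classes: published_date will come back as character after read.csv(). If the file is only destined for later R sessions, RDS preserves the types; a small sketch:
# RDS keeps column classes (e.g. Date) intact across sessions
saveRDS(df, "nyt_technology_data.rds")
# restore later with: df <- readRDS("nyt_technology_data.rds")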
Step 6: Query the data frame
# Summary statistics
cat("\n=== SUMMARY STATISTICS ===\n")
##
## === SUMMARY STATISTICS ===
cat("Total articles:", nrow(df), "\n")
## Total articles: 29
cat("Date range:", as.character(min(df$published_date)), "to",
as.character(max(df$published_date)), "\n")
## Date range: 2025-10-16 to 2025-10-24
cat("Unique sections:", length(unique(df$section)), "\n")
## Unique sections: 7
# Articles by section
cat("\n=== ARTICLES BY SECTION ===\n")
##
## === ARTICLES BY SECTION ===
section_counts <- table(df$section)
print(section_counts)
##
## arts business health magazine podcasts science technology
## 1 8 1 2 2 2 13
cat("Most frequent section:", names(which.max(section_counts)), "\n")
## Most frequent section: technology
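Since dplyr is already attached, the same tally reads naturally in pipe style; an equivalent sketch:
# dplyr equivalent of table(df$section), sorted by count
df %>%
  count(section, sort = TRUE)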
# Title length analysis
cat("\n=== TITLE ANALYSIS ===\n")
##
## === TITLE ANALYSIS ===
df$title_length <- nchar(df$title)
cat("Average title length:", round(mean(df$title_length)), "characters\n")
## Average title length: 62 characters
cat("Longest title:", max(df$title_length), "characters\n")
## Longest title: 101 characters
cat("Shortest title:", min(df$title_length), "characters\n")
## Shortest title: 38 characters
# Find articles containing specific keywords
cat("\n=== ARTICLES ABOUT AI ===\n")
##
## === ARTICLES ABOUT AI ===
ai_articles <- df[grepl("AI|artificial intelligence", df$title, ignore.case = TRUE), ]
cat("Found", nrow(ai_articles), "articles about AI:\n")
## Found 4 articles about AI:
kable(ai_articles[, c("title", "published_date")])
|    | title                                                                                                 | published_date |
|:---|:------------------------------------------------------------------------------------------------------|:---------------|
| 13 | OpenAI Unveils Web Browser Built for Artificial Intelligence                                            | 2025-10-21     |
| 14 | G.M. Raises Profit Forecast on Strong Demand and Lower Tariff Costs                                     | 2025-10-21     |
| 25 | California Regulates A.I. Companions + OpenAI Investigates Its Critics + The Hard Fork Review of Slop   | 2025-10-17     |
| 29 | China’s Rare Earth Restrictions Aim to Beat U.S. at Its Own Game                                        | 2025-10-16     |
# Latest articles
cat("\n=== LATEST 5 ARTICLES ===\n")
##
## === LATEST 5 ARTICLES ===
latest_articles <- df[order(df$published_date, decreasing = TRUE), ]
kable(head(latest_articles[, c("title", "published_date", "section")], 5))
|   | title                                                                               | published_date | section    |
|:--|:-------------------------------------------------------------------------------------|:---------------|:-----------|
| 5 | Celebrities Fight Sora + Amazon’s Secret Automation Plans + ChatGPT Gets a Browser    | 2025-10-24     | podcasts   |
| 6 | A Teen in Love With a Chatbot Killed Himself. Can the Chatbot Be Held Responsible?    | 2025-10-24     | magazine   |
| 1 | Meta Layoffs Included Employees Who Monitored Risks to User Privacy                   | 2025-10-23     | technology |
| 2 | Trump Pardons Founder of the Crypto Exchange Binance                                  | 2025-10-23     | technology |
| 7 | Ford’s Profit Jumps on Strong Sales but Company Lowers its Outlook                    | 2025-10-23     | business   |
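The same "latest five" query can be written in pipe style with the already-loaded dplyr; an equivalent sketch (slice_head() needs dplyr 1.0+):
# dplyr equivalent: sort by date, keep three display columns, take the top 5
df %>%
  arrange(desc(published_date)) %>%
  select(title, published_date, section) %>%
  slice_head(n = 5) %>%
  kable()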
# Check for missing values
cat("\n=== MISSING VALUES ===\n")
##
## === MISSING VALUES ===
missing_summary <- sapply(df, function(x) sum(is.na(x)))
print(missing_summary)
## title section abstract url byline
## 0 0 0 0 0
## published_date item_type title_length
## 0 0 0
# Check for empty strings
cat("\n=== EMPTY STRINGS ===\n")
##
## === EMPTY STRINGS ===
empty_strings <- sapply(df, function(x) {
  # comparing a Date column to "" yields NA, so only test character columns
  if (is.character(x)) sum(x == "" | is.na(x)) else sum(is.na(x))
})
print(empty_strings)
## title section abstract url byline
## 0 0 0 0 0
## published_date item_type title_length
## 0 0 0
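With dplyr, both checks collapse into single pipelines; an equivalent sketch for the NA counts:
# NA count for every column in one summarise() call
df %>%
  summarise(across(everything(), ~ sum(is.na(.x))))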