Establish S3 Connection
# install.packages("aws.s3")
# install.packages("rjson")
library(aws.s3)
library(rstudioapi) #askForSecret
library(rjson)
library(knitr) # opts_knit$set(root.dir = "~/path") set directory for the entire workbook
library(DT)
library(dplyr)
# Load the AWS access key, secret key and bucket name from a local JSON file
# so that no secrets are hard-coded in this document.
aws_credential <- fromJSON(file = "aws_credential.txt")

# Fail early with a readable message if a field used below is absent, rather
# than with an obscure error raised later by Sys.setenv() or get_bucket().
stopifnot(
  "aws_credential.txt must define 'access_key'" =
    !is.null(aws_credential$access_key),
  "aws_credential.txt must define 'secret_key'" =
    !is.null(aws_credential$secret_key),
  "aws_credential.txt must define 'bucket'" =
    !is.null(aws_credential$bucket)
)

# Single definition of the region so the environment variable and the
# bucket request cannot drift apart.
aws_region <- "us-east-2"

Sys.setenv(
  "AWS_ACCESS_KEY_ID" = aws_credential$access_key,
  "AWS_SECRET_ACCESS_KEY" = aws_credential$secret_key,
  "AWS_DEFAULT_REGION" = aws_region
)

# max = Inf asks aws.s3 to page through the complete listing. Per the aws.s3
# documentation the default returns at most one request's worth of keys
# (1000), which would silently truncate the catalog for larger buckets.
bucket_info <- get_bucket(
  aws_credential$bucket,
  region = aws_region,
  max = Inf
)
Explore Objects in Bucket
Create data catalog
# Field names carried by an object entry of the listing; every entry is
# stored under the name "Contents", so this looks at the first one.
names(bucket_info[["Contents"]])
## [1] "Key" "LastModified" "ETag"
## [4] "ChecksumAlgorithm" "ChecksumType" "Size"
## [7] "Owner" "StorageClass" "Bucket"
# Number of object entries returned by the listing.
length(bucket_info)
## [1] 285
# Build the data catalog: one row per object entry in the bucket listing.
#
# vapply() pins the type of every column. In particular `size_KB` is now
# numeric; previously it was written into a character vector, which turned
# later numeric comparisons such as `size_KB > 0` into string comparisons.
# Iterating over seq_along() also means an empty listing produces a zero-row
# catalog instead of failing on the `1:0` index sequence.
entry_index <- seq_along(bucket_info)

# Object key (path inside the bucket).
key_list <- vapply(
  entry_index,
  function(i) bucket_info[[i]]$Key,
  character(1)
)

# Last-modified timestamp, kept as text exactly as delivered by the listing.
modify_time_list <- vapply(
  entry_index,
  function(i) as.character(bucket_info[[i]]$LastModified),
  character(1)
)

# Object size converted from bytes to whole kilobytes.
size_list <- vapply(
  entry_index,
  function(i) round(as.numeric(bucket_info[[i]]$Size) / 1024),
  numeric(1)
)

bucket_items <- data.frame(
  key = key_list,
  last_modify = modify_time_list,
  size_KB = size_list
)
List selected objects
# Flag catalog rows whose key contains "visa_scraped/" (case-insensitive).
subset_criteria <- grepl("visa_scraped/", bucket_items$key, ignore.case = TRUE)

# Render the flagged rows as an interactive table: five rows per page and
# no row-name column.
datatable(
  bucket_items[subset_criteria, ],
  options = list(pageLength = 5),
  rownames = FALSE,
  caption = "Scraped Data Inventory in S3 Bucket"
)
Dataframes
# Keep the non-empty CSV objects whose key contains "output".
#
# * "\\.csv$" matches a literal ".csv" extension at the end of the key; the
#   previous pattern '.csv' treated "." as a wildcard and matched anywhere
#   in the key (e.g. "xcsv" or "data.csv.bak").
# * as.numeric() makes the size test a numeric comparison even when
#   `size_KB` is stored as text.
df_items <- bucket_items |>
  filter(
    grepl("output", key, fixed = TRUE),
    as.numeric(size_KB) > 0,
    grepl("\\.csv$", key)
  )
df_items
## key last_modify size_KB
## 1 visa_output/df_iv.csv 2025-04-15T23:00:32.000Z 4502
## 2 visa_output/df_niv.csv 2025-04-15T23:00:26.000Z 8370
## 3 visa_output/iv_alltime.csv 2025-04-15T03:29:22.000Z 4484
## 4 visa_output/niv_alltime.csv 2025-04-15T03:29:19.000Z 8289
Read .csv
from S3
# Download visa_output/df_iv.csv from the bucket and parse it with read.csv.
df_iv <- s3read_using(
  FUN = read.csv,
  object = paste0("s3://", aws_credential$bucket, "/visa_output/df_iv.csv")
)

# Show the Diversity Visa (DV) rows dated 2025-02-28, five rows per page.
# filter() drops rows where `time` or `visa` is NA; base logical subsetting
# (`df[cond, ]`) would instead return an all-NA row for each such record.
datatable(
  df_iv |> filter(time == "2025-02-28", visa == "DV"),
  caption = "February 2025 U.S. Diversity Visa",
  rownames = NULL,
  options = list(pageLength = 5)
)
Read .txt
# Locate the country list among the catalogued keys. fixed = TRUE matches
# the file name literally (an unescaped "." is a regex wildcard). The guard
# stops with a clear message unless exactly one key matches, because zero or
# several matches would build an invalid object path.
country_key <- grep("country_list.txt", key_list, fixed = TRUE, value = TRUE)
if (length(country_key) != 1L) {
  stop(
    "Expected exactly one 'country_list.txt' key in the bucket, found ",
    length(country_key),
    call. = FALSE
  )
}

# The file appears to have no header row: with read.delim()'s default
# header = TRUE the first country was consumed as the column name (it showed
# up as `COTE.D.IVOIRE` in the rendered output) and was lost as data.
# Read every line as data and name the column explicitly.
# NOTE(review): assumes a single column, as the rendered output suggests.
country <- s3read_using(
  FUN = read.delim,
  header = FALSE,
  col.names = "country",
  object = paste0("s3://", aws_credential$bucket, "/", country_key)
)
tail(country)
## COTE.D.IVOIRE
## 252 OMAN
## 253 SEYCHELLES
## 254 HONG KONG S A R
## 255 SINT MAARTEN
## 256 SERBIA
## 257 SRI LANKA