Establish S3 Connection
Explore Objects in Bucket
- Create data catalog
- List selected objects
Dataframes
- Read .csv from S3
- Read .txt

Establish S3 Connection

# install.packages("aws.s3")
# install.packages("rjson")
library(aws.s3)
library(rstudioapi) #askForSecret
library(rjson)
library(knitr) # opts_knit$set(root.dir = "~/path") set directory for the entire workbook
library(DT)
library(dplyr)

# Parse the local JSON credential file; the fields used below are
# access_key, secret_key and bucket.
aws_credential <- fromJSON(file = "aws_credential.txt")

# Publish the credentials and the region as environment variables for
# this R session.
Sys.setenv(
  AWS_ACCESS_KEY_ID = aws_credential$access_key,
  AWS_SECRET_ACCESS_KEY = aws_credential$secret_key,
  AWS_DEFAULT_REGION = "us-east-2"
)

# Fetch the object listing of the configured bucket.
bucket_info <- get_bucket(bucket = aws_credential$bucket, region = "us-east-2")

Explore Objects in Bucket

Create data catalog

# Show which metadata fields a listing entry carries (first "Contents" entry).
names(bucket_info[["Contents"]])

## [1] "Key"               "LastModified"      "ETag"             
## [4] "ChecksumAlgorithm" "ChecksumType"      "Size"             
## [7] "Owner"             "StorageClass"      "Bucket"

# Number of entries in the bucket listing.
length(bucket_info)

## [1] 285

# Build a data catalog with one row per entry of the bucket listing.
#
# vapply() replaces the index loop: it checks that every entry yields exactly
# one value of the declared type and returns zero-length vectors for an empty
# listing (the former `1:length(bucket_info)` loop would index element 1 of an
# empty list and fail).
key_list <- vapply(
  bucket_info,
  function(obj) as.character(obj[["Key"]]),
  character(1),
  USE.NAMES = FALSE
)
modify_time_list <- vapply(
  bucket_info,
  function(obj) as.character(obj[["LastModified"]]),
  character(1),
  USE.NAMES = FALSE
)
# Sizes are kept numeric (rounded KB) so that comparisons such as
# `size_KB > 0` and sorting behave numerically instead of as strings.
size_list <- vapply(
  bucket_info,
  function(obj) round(as.numeric(obj[["Size"]]) / 1024),
  numeric(1),
  USE.NAMES = FALSE
)

bucket_items <- data.frame(
  key = key_list,
  last_modify = modify_time_list,
  size_KB = size_list
)

List selected objects

# Flag catalog rows whose key contains "visa_scraped/" (case-insensitive).
subset_criteria <- grepl(
  pattern = "visa_scraped/",
  x = bucket_items$key,
  ignore.case = TRUE
)

# Render the matching rows as an interactive table, five rows per page.
# `FALSE` is spelled out (the alias `F` can be reassigned) and the options
# list is passed by name rather than by position.
datatable(
  bucket_items[subset_criteria, ],
  caption = "Scraped Data Inventory in S3 Bucket",
  rownames = FALSE,
  options = list(pageLength = 5)
)

Dataframes

# Keep non-empty .csv objects whose key contains "output".
# - "\\.csv$" matches a literal ".csv" extension at the end of the key; the
#   former pattern '.csv' treated the dot as "any character" and matched
#   anywhere in the key.
# - as.numeric() makes the size test a numeric comparison regardless of
#   whether size_KB is stored as character or numeric.
df_items <- bucket_items |>
  filter(
    grepl("output", key),
    grepl("\\.csv$", key),
    as.numeric(size_KB) > 0
  )
df_items

##                           key              last_modify size_KB
## 1       visa_output/df_iv.csv 2025-04-15T23:00:32.000Z    4502
## 2      visa_output/df_niv.csv 2025-04-15T23:00:26.000Z    8370
## 3  visa_output/iv_alltime.csv 2025-04-15T03:29:22.000Z    4484
## 4 visa_output/niv_alltime.csv 2025-04-15T03:29:19.000Z    8289

Read `.csv` from S3

# Read visa_output/df_iv.csv from the configured bucket into a data frame.
df_iv <- s3read_using(
  FUN = read.csv,
  object = paste0("s3://", aws_credential$bucket, "/visa_output/df_iv.csv")
)

# Show the rows for time == "2025-02-28" and visa == "DV", five per page.
# which() drops positions where the condition is NA; indexing with the raw
# logical vector would insert an all-NA row for each of them.
datatable(
  df_iv[which(df_iv$time == "2025-02-28" & df_iv$visa == "DV"), ],
  caption = "February 2025 U.S. Diversity Visa",
  rownames = NULL,
  options = list(pageLength = 5)
)

Read `.txt`

# Locate the country list object by its literal file name and insist on a
# single match, so exactly one S3 URI is built below.
country_key <- grep("country_list.txt", key_list, fixed = TRUE, value = TRUE)
stopifnot(length(country_key) == 1L)

# The text file has no header row: with read.delim's default header = TRUE
# the first country was consumed as the column name. Read every line as data
# and name the single column explicitly.
country <- s3read_using(
  FUN = read.delim,
  header = FALSE,
  col.names = "country",
  object = paste0("s3://", aws_credential$bucket, "/", country_key)
)
tail(country)

##       COTE.D.IVOIRE
## 252            OMAN
## 253      SEYCHELLES
## 254 HONG KONG S A R
## 255    SINT MAARTEN
## 256          SERBIA
## 257       SRI LANKA

aws_s3

Tiangeng Lu

2025-04-16

Establish S3 Connection

Explore Objects in Bucket

Create data catalog

List selected objects

Dataframes

Read `.csv` from S3

Read `.txt`

aws_s3

Tiangeng Lu

2025-04-16

Establish S3 Connection

Explore Objects in Bucket

Create data catalog

List selected objects

Dataframes

Read .csv from S3

Read .txt

Read `.csv` from S3

Read `.txt`