Multi-Table Datasets
Accessing multi-table datasets on Dewey via API
Currently, accessing multi-table products is not available in the deweydatapy
(Python) or deweydataR
(R) libraries. To work around this, you can use the provided code snippets to:
- Python: Create a Multitable class with methods to download data from tables in a multi-table dataset.
- R: Utilize separate functions that perform similar tasks to handle multi-table datasets.
import random
import requests
import time
import os
class Multitable:
def __init__(self, apikey, pp_):
self.apikey = apikey
self.pp_ = pp_
def make_get_request(self, url, params = None, attempt_count = 1, max_attempts = 5):
"""
Call API using GET request, including retry logic
param url: API endpoint
param params: parameters for the API call
"""
headers = {'X-API-KEY': self.apikey,
'Content-Type': 'application/json'}
response = requests.get(url, params=params, headers=headers)
if response.status_code == 429 and attempt_count <= max_attempts:
sleep_time = (2 ** attempt_count) + random.uniform(0, 2)
print(f'Ratelimit reached, retrying after {sleep_time:.1f} seconds')
time.sleep(sleep_time)
return self.make_get_request(url, params=params, attempt_count=attempt_count+1)
if response.status_code == 400:
raise Exception(f'Request failed with {response.status_code}')
if not (response.status_code >= 200 and response.status_code < 300):
raise Exception(f'Request failed with {response.status_code} and {response.json()}')
return response
def get_table_list(self):
"""
Get a list of tables available for download from a multi-table product.
"""
results = self.make_get_request(url=self.pp_+'/multi-table-product-metadata')
table_options = [table_dict['table_name'] for table_dict in results.json()['items']]
return table_options
def get_multi_tables(self, table_list, directory):
"""
Download all files from a list of tables from a multi-table product.
:param table_list: list of tables to download. Call get_table_list() to get a list of available tables.
:param directory: directory to save downloaded files
"""
for table_name in table_list:
# make folder for table if it doesn't exist
table_dir = os.path.join(directory, table_name)
if not os.path.exists(table_dir):
os.makedirs(table_dir)
# loop through all API result pages, keeping track of number of downloaded files
for table_name in table_list:
page = 1
download_count = 0
while True:
# get results from API endpoint, using API key for authentication, for a specific page
results = self.make_get_request(
url=self.pp_,
params={'page': page, 'table_name': table_name}
)
response_json = results.json()
# for each result page, loop through download links and save to your computer
for link_data in response_json['download_links']:
file_name = link_data['file_name']
print(f"Downloading file {file_name} to folder {table_name}...")
data = self.make_get_request(link_data['link'])
with open(os.path.join(table_name, file_name), 'wb') as file:
file.write(data.content)
download_count += 1
# only continue if there are more result pages to process
total_pages = response_json['total_pages']
if page >= total_pages:
break
page += 1
print(f"Successfully downloaded {download_count} files to folder {table_name}.")
library(httr)
library(jsonlite)
library(stringr)
# Call API using GET request, including retry logic
make_get_request <- function(apikey, url, params = list(), attempt_count = 1, max_attempts = 5) {
# Set headers
headers <- add_headers('X-API-KEY' = apikey,
'Content-Type' = 'application/json')
# Make GET request
response <- GET(url, query = params, headers)
# Check for rate limit and retry if necessary
if (status_code(response) == 429 && attempt_count <= max_attempts) {
sleep_time <- (2 ^ attempt_count) + runif(1, 0, 2)
message(sprintf('Ratelimit reached, retrying after %.1f seconds', sleep_time))
Sys.sleep(sleep_time)
return(make_get_request(apikey, url, params, attempt_count + 1, max_attempts))
}
# Check for bad request and handle errors
if (status_code(response) == 400) {
stop(sprintf('Request failed with %d', status_code(response)))
}
if (!(status_code(response) >= 200 && status_code(response) < 300)) {
stop(sprintf('Request failed with %d and %s', status_code(response), content(response, "text")))
}
return(response)
}
# Get a list of tables available for download from a multi-table product.
get_table_list <- function(apikey, pp_) {
# Get a list of tables available for download
url <- paste0(pp_, '/multi-table-product-metadata')
response <- make_get_request(apikey, url)
response_json <- content(response, as = "parsed")
table_options <- sapply(response_json$items, function(x) x$table_name)
return(table_options)
}
# Download all files from a list of tables from a multi-table product.
get_multi_tables <- function(apikey, pp_, table_list, directory) {
# Download all files from a list of tables
for (table_name in table_list) {
# Make folder for table if it doesn't exist
table_dir <- file.path(directory, table_name)
if (!dir.exists(table_dir)) {
dir.create(table_dir, recursive = TRUE)
}
page <- 1
download_count <- 0
repeat {
# Get results from API endpoint for a specific page
results <- make_get_request(apikey, pp_, list(page = page, table_name = table_name))
response_json <- content(results, as = "parsed")
# Loop through download links and save to computer
for (link_data in response_json$download_links) {
file_name <- link_data$file_name
message(sprintf("Downloading file %s to folder %s...", file_name, table_name))
data <- make_get_request(apikey, link_data$link)
# Save the content to a file
writeBin(content(data, "raw"), file.path(table_dir, file_name))
download_count <- download_count + 1
}
# Check if there are more pages to process
total_pages <- response_json$total_pages
if (page >= total_pages) {
break
}
page <- page + 1
}
message(sprintf("Successfully downloaded %d files to folder %s.", download_count, table_name))
}
}
Example use
Set up
# API Key
apikey = "<API_KEY_STEP_1>"
# Product path
pp_ = "<PRODUCT_PATH_STEP_2>"
# Initialize the class and store in variable mtp
mtp = Multitable(apikey, pp_)
# API Key
apikey <- "<API_KEY_STEP_1>"
# Product path
pp_ <- "<PRODUCT_PATH_STEP_2>"
Get a list of available tables and download
# Show all tables available
tables = mtp.get_table_list()
print(tables)
# Download all tables in the list
mtp.get_multi_tables(tables, "<YOUR_DIRECTORY>")
# Show all tables available
tables <- get_table_list(apikey, pp_)
print(tables)
get_multi_tables(apikey, pp_, tables, "<YOUR_DIRECTORY>")
Download select tables from the list
# Define a subset from the list of tables available
table_subset = ['TABLE_1', 'TABLE_2']
# Download the tables in the list
mtp.get_multi_tables(table_subset, "<YOUR_DIRECTORY>")
table_subset <- ['TABLE_1', 'TABLE_2']
get_multi_tables(apikey, pp_, table_subset, "<YOUR_DIRECTORY>")
Updated 16 days ago