Multi-Table Datasets

Accessing multi-table datasets on Dewey via API

Currently, accessing multi-table products is not available in the deweydatapy (Python) or deweydataR (R) libraries. To work around this, you can use the provided code snippets to:

  • Python: Create a Multitable class with methods to download data from tables in a multi-table dataset.
  • R: Utilize separate functions that perform similar tasks to handle multi-table datasets.
import random
import requests
import time
import os

class Multitable:
    def __init__(self, apikey, pp_):
        self.apikey = apikey
        self.pp_ = pp_
    
    def make_get_request(self, url, params = None, attempt_count = 1, max_attempts = 5):
        """
        Call API using GET request, including retry logic
        param url: API endpoint
        param params: parameters for the API call
        """
        headers = {'X-API-KEY': self.apikey,
                   'Content-Type': 'application/json'}

        response = requests.get(url, params=params, headers=headers)

        if response.status_code == 429 and attempt_count <= max_attempts:
            sleep_time = (2 ** attempt_count) + random.uniform(0, 2)
            print(f'Ratelimit reached, retrying after {sleep_time:.1f} seconds')
            time.sleep(sleep_time)
            return self.make_get_request(url, params=params, attempt_count=attempt_count+1)

        if response.status_code == 400:
            raise Exception(f'Request failed with {response.status_code}')

        if not (response.status_code >= 200 and response.status_code < 300):
            raise Exception(f'Request failed with {response.status_code} and {response.json()}')

        return response
    
    
    def get_table_list(self):
        """
        Get a list of tables available for download from a multi-table product.
        """
        results = self.make_get_request(url=self.pp_+'/multi-table-product-metadata')
        table_options = [table_dict['table_name'] for table_dict in results.json()['items']]
        return table_options
    
    
    def get_multi_tables(self, table_list, directory):
        """
        Download all files from a list of tables from a multi-table product.
        :param table_list: list of tables to download. Call get_table_list() to get a list of available tables.
        :param directory: directory to save downloaded files
        """
        for table_name in table_list:
        # make folder for table if it doesn't exist
            table_dir = os.path.join(directory, table_name)

            if not os.path.exists(table_dir): 
                os.makedirs(table_dir)
        
        # loop through all API result pages, keeping track of number of downloaded files
        for table_name in table_list:
            page = 1
            download_count = 0
            while True:
                # get results from API endpoint, using API key for authentication, for a specific page
                results = self.make_get_request(
                    url=self.pp_,
                    params={'page': page, 'table_name': table_name}
                )
                response_json = results.json()

                # for each result page, loop through download links and save to your computer
                for link_data in response_json['download_links']:
                        file_name = link_data['file_name']
                        print(f"Downloading file {file_name} to folder {table_name}...")
                        data = self.make_get_request(link_data['link'])
                        with open(os.path.join(table_name, file_name), 'wb') as file:
                            file.write(data.content)
                        download_count += 1
                
                # only continue if there are more result pages to process
                total_pages = response_json['total_pages']
                if page >= total_pages:
                    break
                page += 1
            
            print(f"Successfully downloaded {download_count} files to folder {table_name}.")
library(httr)
library(jsonlite)
library(stringr)


# Call API using GET request, including retry logic
make_get_request <- function(apikey, url, params = list(), attempt_count = 1, max_attempts = 5) {
  # Set headers
  headers <- add_headers('X-API-KEY' = apikey,
                         'Content-Type' = 'application/json')

  # Make GET request
  response <- GET(url, query = params, headers)

  # Check for rate limit and retry if necessary
  if (status_code(response) == 429 && attempt_count <= max_attempts) {
    sleep_time <- (2 ^ attempt_count) + runif(1, 0, 2)
    message(sprintf('Ratelimit reached, retrying after %.1f seconds', sleep_time))
    Sys.sleep(sleep_time)
    return(make_get_request(apikey, url, params, attempt_count + 1, max_attempts))
  }

  # Check for bad request and handle errors
  if (status_code(response) == 400) {
    stop(sprintf('Request failed with %d', status_code(response)))
  }

  if (!(status_code(response) >= 200 && status_code(response) < 300)) {
    stop(sprintf('Request failed with %d and %s', status_code(response), content(response, "text")))
  }

  return(response)
}


# Get a list of tables available for download from a multi-table product.
get_table_list <- function(apikey, pp_) {
  # Get a list of tables available for download
  url <- paste0(pp_, '/multi-table-product-metadata')
  response <- make_get_request(apikey, url)
  response_json <- content(response, as = "parsed")
  table_options <- sapply(response_json$items, function(x) x$table_name)
  return(table_options)
}


# Download all files from a list of tables from a multi-table product.
get_multi_tables <- function(apikey, pp_, table_list, directory) {
  # Download all files from a list of tables
  for (table_name in table_list) {
    # Make folder for table if it doesn't exist
    table_dir <- file.path(directory, table_name)
    if (!dir.exists(table_dir)) {
      dir.create(table_dir, recursive = TRUE)
    }

    page <- 1
    download_count <- 0
    repeat {
      # Get results from API endpoint for a specific page
      results <- make_get_request(apikey, pp_, list(page = page, table_name = table_name))
      response_json <- content(results, as = "parsed")

      # Loop through download links and save to computer
      for (link_data in response_json$download_links) {
        file_name <- link_data$file_name
        message(sprintf("Downloading file %s to folder %s...", file_name, table_name))
        data <- make_get_request(apikey, link_data$link)
        
        # Save the content to a file
        writeBin(content(data, "raw"), file.path(table_dir, file_name))
        download_count <- download_count + 1
      }

      # Check if there are more pages to process
      total_pages <- response_json$total_pages
      if (page >= total_pages) {
        break
      }
      page <- page + 1
    }

    message(sprintf("Successfully downloaded %d files to folder %s.", download_count, table_name))
  }
}

Example use

Set up

# API Key
apikey = "<API_KEY_STEP_1>"

# Product path
pp_ = "<PRODUCT_PATH_STEP_2>"

# Initialize the class and store in variable mtp
mtp = Multitable(apikey, pp_)
# API Key
apikey <- "<API_KEY_STEP_1>"

# Product path
pp_ <- "<PRODUCT_PATH_STEP_2>"

Get a list of available tables and download

# Show all tables available
tables =  mtp.get_table_list()
print(tables)

# Download all tables in the list
mtp.get_multi_tables(tables, "<YOUR_DIRECTORY>")
# Show all tables available
tables <-  get_table_list(apikey, pp_)
print(tables)

get_multi_tables(apikey, pp_, tables, "<YOUR_DIRECTORY>")

Download select tables from the list

# Define a subset from the list of tables available
table_subset = ['TABLE_1', 'TABLE_2']

# Download the tables in the list
mtp.get_multi_tables(table_subset, "<YOUR_DIRECTORY>")
table_subset <-  ['TABLE_1', 'TABLE_2']

get_multi_tables(apikey, pp_, table_subset, "<YOUR_DIRECTORY>")