Dataset¶
A dataset is a unique set of data as defined by a Supplier and Crux. It comprises a series of data with unique content that is updated (as metadata and file resources) on a given schedule from a Supplier, processed, and stored by Crux.
List subscribed datasets¶
import os
from crux import Crux
# Read the API key once and fail fast when it is missing.
api_key = os.getenv("CRUX_API_KEY")
if not api_key:
    raise ValueError("CRUX_API_KEY is unset")

# Open a client against the given API host.
conn = Crux(api_key=api_key, api_host='https://api.cruxinformatics.com')

# Print the name and id of every dataset returned by list_datasets().
datasets = conn.list_datasets()
for dataset in datasets:
    print('Dataset {} identified by {}'.format(dataset.name, dataset.id))
Get a dataset¶
import os
from crux import Crux
# Read the API key once and fail fast when it is missing.
api_key = os.getenv("CRUX_API_KEY")
if not api_key:
    raise ValueError("CRUX_API_KEY is unset")

# Open a client against the given API host.
conn = Crux(api_key=api_key, api_host='https://api.cruxinformatics.com')

# "DATASET_ID" looks like a placeholder; substitute the id of a real dataset.
dataset = conn.get_dataset("DATASET_ID")
Get the latest Dataset file frames for all subscriptions¶
import os
import logging
import tempfile
from crux import Crux
from crux.models.resource import MediaType
# Module-level logger for this example; INFO so the progress lines are shown.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Only the API key is needed: this example walks every dataset returned by
# list_datasets(), so no single dataset id has to be configured.
if not os.getenv("CRUX_API_KEY"):
    raise ValueError("CRUX_API_KEY is unset")

# Shared client used by main().
CRUX_CLIENT = Crux(api_key=os.getenv("CRUX_API_KEY"))
def main():
    """Download the latest CSV file frames of every listed dataset.

    For each dataset returned by ``CRUX_CLIENT.list_datasets()`` the latest
    files are requested in CSV format, and each one is downloaded into the
    system temporary directory under its own file name.
    """
    logging.basicConfig(level=logging.INFO)
    for dataset in CRUX_CLIENT.list_datasets():
        log.info("Dataset: %s", dataset.name)
        file_set = dataset.get_latest_files(
            frames=None,  # optional str or list[str]
            cutoff_date=None,  # search up to this date
            file_format=MediaType.CSV.value
        )
        # The download loop runs once per dataset, so it sits inside the
        # dataset loop rather than after it.
        for file in file_set:
            local_file_path = os.path.join(tempfile.gettempdir(), file.name)
            log.info(" Download %s size=%s", local_file_path, file.size)
            file.download(local_file_path)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Fetch Dataset file frames for a selected time range¶
import os
import logging
import tempfile
from crux import Crux
from crux.models.resource import MediaType
# Module-level logger for this example; INFO so the progress lines are shown.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Both settings are required: the API key to build the client, and the
# dataset id that main() reads from CRUX_DSID.
if not os.getenv("CRUX_API_KEY"):
    raise ValueError("CRUX_API_KEY is unset")
if not os.getenv("CRUX_DSID"):
    raise ValueError("CRUX_DSID is unset")

# Shared client used by main().
CRUX_CLIENT = Crux(api_key=os.getenv("CRUX_API_KEY"))
def main():
    """Download one dataset's AVRO file frames for a fixed date range.

    The dataset id is read from the ``CRUX_DSID`` environment variable. Files
    between the hard-coded start and end dates are requested in AVRO format,
    and each one is downloaded into the system temporary directory under its
    own file name.
    """
    logging.basicConfig(level=logging.INFO)
    dataset_id = os.getenv("CRUX_DSID")
    dataset = CRUX_CLIENT.get_dataset(dataset_id)
    log.info("Dataset: %s", dataset.name)
    file_set = dataset.get_files_range(
        start_date="2/1/2020",
        end_date="2/28/2020",
        frames=None,  # optional str or list[str]
        file_format=MediaType.AVRO.value
    )
    for file in file_set:
        local_file_path = os.path.join(tempfile.gettempdir(), file.name)
        log.info(" Download %s size=%s", local_file_path, file.size)
        file.download(local_file_path)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()