Datasets#
lit.sdk.data.datasets
#
This module provides methods for creating datasets that will be utilized during the build process.
StrPath = str | os.PathLike[str]
module
#
Represent a PEP 604 union type
E.g. for int | str
__doc__ = '\nThis module provides methods for creating datasets that will be utilized during the build process.\n'
module
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__file__ = '/opt/lit-platform/lit-lib/src/lit/sdk/data/datasets.cpython-312-x86_64-linux-gnu.so'
module
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__name__ = 'lit.sdk.data.datasets'
module
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__package__ = 'lit.sdk.data'
module
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__test__ = {'Dataset.__getitem__ (line 137)': '\n Enable subscription (slicing/indexing) of datasets directly.\n Delegates to the adapter\'s __getitem__ method.\n \n Examples:\n >>> ds = Dataset.from_team_and_name("contoso", "nvda")\n >>> data = ds[-10:] # Last 10 records\n >>> data = ds[100:200] # Records 100-199\n >>> data = ds[50] # Single record at index 50\n ', 'list_datasets (line 159)': 'Returns a list of dataset names\n\n Args:\n team (str): The team the datasets belongs to.\n\n\n Returns:\n (list[str]): The collection of dataset names.\n\n Examples:\n >>> list_datasets("contoso")\n ["MSFT", "AAPL", "SPY"]\n ', 'get_dataset (line 178)': 'Returns a dataset by name\n\n Args:\n team (str): The team the datasets belongs to.\n name (str): Name of the dataset to be returned\n\n Returns:\n (dict): The dataset.\n\n Examples:\n >>> get_dataset("contoso", "my_ds")\n {\n "name": "my_ds",\n "raw": ["/data/contoso/raw/sample.csv.gz"],\n "events": [{\n "type": "Added raw",\n "detail": "/data/contoso/raw/sample.csv.gz",\n "timestamp": 11729182007,\n "username": "lit_user"\n },\n {\n "type": "init",\n "detail": "began work on my_ds",\n "timestamp": 1729181556,\n "username": "lit_user"\n }],\n }\n ', 'init_dataset (line 213)': 'Create a new dataset\n\n Args:\n team (str): The team the dataset belongs to.\n name (str): Name of the new dataset.\n\n Returns:\n (dict): The dataset.\n\n Examples:\n >>> init_dataset("contoso", "my_ds")\n {\n "name": "my_ds",\n "raw": [],\n "events": [{\n "type": "init",\n "detail": "began work on my_ds",\n "timestamp": 1729181556,\n "username": "lit_user"\n }],\n }\n ', 'add_path_to_dataset (line 242)': 'Add path to existing dataset\n\n Args:\n team (str): The team the dataset belongs to.\n name (str): Name of the dataset the path is to be added.\n path (str): Path of file to be added to dataset.\n\n Returns:\n (dict): The dataset.\n\n Examples:\n >>> add_path_to_dataset("contoso", "my_ds", "/data/contoso/raw/sample.csv.gz")\n {\n "name": 
"my_ds",\n "raw": ["/data/contoso/raw/sample.csv.gz"],\n "events": [{\n "type": "Added raw",\n "detail": "/data/contoso/raw/sample.csv.gz",\n "timestamp": 11729182007,\n "username": "lit_user"\n },\n {\n "type": "init",\n "detail": "began work on my_ds",\n "timestamp": 1729181556,\n "username": "lit_user"\n }],\n }\n ', 'remove_path_to_dataset (line 279)': 'Remove path from existing dataset\n\n Args:\n team (str): The team the dataset belongs to.\n name (str): Name of the dataset the path is to be removed.\n path (str): Path of file to be removed to dataset.\n\n Returns:\n (dict): The dataset.\n\n Examples:\n >>> remove_path_to_dataset("contoso", "my_ds", "/data/contoso/raw/sample.csv.gz")\n {\n "name": "my_ds",\n "raw": ["/data/contoso/raw/sample.csv.gz"],\n "events": [{\n "type": "Removed raw",\n "detail": "/data/contoso/raw/sample.csv.gz",\n "timestamp": 1729184479,\n "username": "lit_user"\n },\n {\n "type": "Added raw",\n "detail": "/data/contoso/raw/sample.csv.gz",\n "timestamp": 11729182007,\n "username": "lit_user"\n },\n {\n "type": "init",\n "detail": "began work on my_ds",\n "timestamp": 1729181556,\n "username": "lit_user"\n }],\n }\n ', 'get_sample_count (line 322)': '\n Retrieves the sample count for a specified dataset within a team.\n\n Args:\n team_name (str): The name of the team.\n name (str): The name of the dataset.\n\n Returns:\n (int): The sample count for the specified dataset.\n\n Examples:\n >>> get_sample_count("contoso", "my_ds")\n 52042581\n ', 'get_data (line 343)': '\n Retrieves data for a specified dataset within a team over a given range.\n\n This function fetches data between the `start` and `stop` indices for the given dataset.\n The returned data is either a JSON string or a dictionary. 
If the data is a JSON string,\n it is parsed into a dictionary before being returned.\n\n Args:\n team_name (str): The name of the team.\n name (str): The name of the dataset.\n start (int): The starting index for the data retrieval.\n stop (int): The stopping index for the data retrieval.\n\n Returns:\n (dict): The data for the specified dataset and range, parsed as a dictionary.\n\n Raises:\n TypeError: If the returned data is not of type \'str\' or \'dict\'.\n\n Examples:\n >>> get_data("contoso", "my_ds", 0, 100)\n {...}\n ', 'get_data_by_date (line 379)': '_summary_\n\n Args:\n team_name (str): The name of the team.\n name (str): The name of the dataset.\n timestamp (float): The timestamp around which data is to be retrieved.\n aperture (int): The number of samples to retrieve on either side of the timestamp.\n\n Returns:\n dict: The data around the specified timestamp with the given aperture.\n\n Examples:\n >>> get_data_by_date("contoso", "my_ds", 1494858825, 10000)\n {...}\n ', 'demo (line 401)': '\n Runs a feature demonstration on the specified dataset and returns the result.\n\n Args:\n team_name (str): The name of the team.\n name (str): The name of the dataset.\n feature_path (str): The path to the feature to test.\n index (int): The data index within the dataset to use for the demo.\n params (dict): A set of parameters to pass to the feature script.\n\n Returns:\n (dict): The result of the feature demonstration; the timestamp, return data from the feature, and any UI hints.\n\n Examples:\n >>> demo(\n ... "contoso",\n ... "my_ds",\n ... "/data/contoso/features/ohlcv.py",\n ... 19562810,\n ... {"count": 5, "size": 1, "unit": "hour"},\n ... 
)\n {\'timestamp\': 1493994825045691315,\n \'data\': array([[2.38490005e+02, 2.38559998e+02, 2.33226593e+02, 2.38500000e+02,\n 3.71410000e+04, 8.73224400e+06, 2.35110626e+02],\n [2.38500000e+02, 2.38660004e+02, 2.38300003e+02, 2.38520004e+02,\n 2.33910000e+04, 4.86324900e+06, 2.07911118e+02],\n [2.38528900e+02, 2.38770004e+02, 2.38210007e+02, 2.38500000e+02,\n 2.87640000e+04, 6.10002200e+06, 2.12071411e+02],\n [2.38500000e+02, 2.38798996e+02, 2.38399994e+02, 2.38740005e+02,\n 3.50160000e+04, 7.89611100e+06, 2.25500092e+02],\n [2.39190002e+02, 2.39309998e+02, 2.38839996e+02, 2.38860001e+02,\n 2.14480000e+04, 4.43271800e+06, 2.06672791e+02]]),\n \'hints\': {}}\n ', 'estimate (line 441)': '\n Estimates feature data for a specified dataset.\n\n Args:\n team_name (str): The name of the team.\n name (str): The name of the dataset.\n feature_path (str): The path to the feature script.\n count (int): The number of samples to estimate.\n params (dict): A set of parameters to pass to the feature script.\n\n Returns:\n (NDArray): The estimated feature data as a NumPy array.\n\n Examples:\n >>> estimate(\n ... "contoso",\n ... "spy",\n ... "/data/contoso/features/ohlcv.py",\n ... 5,\n ... {"count": 5, "size": 1, "unit": "hour"},\n ... )\n array([2.43907004e+02, 2.44300000e+02, 2.43257996e+02, 2.43632999e+02,\n 7.06504000e+04, 1.72431610e+07, 2.39403308e+02])\n '}
module
#
dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)
BaseAdapter
#
get_bar_daterange(date, unit='tick', size=1, future=False)
cached
#
Returns#
[ price_open, price_high, price_low, price_close, trade_count, vol_sum, vol_mean, vol_max, timestamp ]
Parameters#
date : pd.Timestamp, optional
The day from which to begin retrieving bars
unit : str, optional
The unit of the bars; one of "tick", "sec", "min", "hour", "day"
future : boolean, optional
By default, look back into the past (computing features) to retrieve bars. Override to 'True' to retrieve from the future (computing labels).
get_bars(index=None, count=1, unit='tick', size=1, future=False, prevent_lookahead_bias=True)
cached
#
Returns#
[ price_open, price_high, price_low, price_close, trade_count, vol_sum, vol_mean, vol_max ]
Parameters#
index : int, optional
The index at which to begin retrieving bars
count : int, optional
The number of bars to retrieve
unit : str, optional
The unit of the bars; one of "tick", "sec", "min", "hour", "day"
future : boolean, optional
By default, look back into the past (computing features) to retrieve bars. Override to 'True' to retrieve from the future (computing labels).
get_date(date)
#
Get all data for a single trading date.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| date | | A date that can be converted to pd.Timestamp (string, datetime, pd.Timestamp) | required |
Returns:
| Type | Description |
|---|---|
| | DataFrame with all records for that trading date |
Examples:
get_max_values()
#
Returns open, high, low, close, count, vol_sum, vol_sum / count, and vol_max.
get_minute_bars(date, size=1, future=False)
#
Get precomputed minute bars for a trading date using get_bar_daterange.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| date | | A date that can be converted to pd.Timestamp | required |
| size | | Bar size in minutes (default 1 for 1-minute bars) | 1 |
| future | | If False (default), look back into past (features). If True, look forward (labels). | False |
Returns:
| Type | Description |
|---|---|
| | DataFrame with precomputed bars: [open, high, low, close, count, vol_sum, vol_mean, vol_max, timestamp] |
Examples:
Dataset
#
__annotations__ = {'team_name': 'str', 'name': 'str', 'raw': 'set[Path]', 'events': 'list[DatasetEvent]', 'attrs': 'dict[str, Any]', 'path': 'Path', '_Dataset__cached_adapter': 'BaseAdapter | None'}
class
#
dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)
__module__ = 'lit.sdk.data.datasets'
class
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__weakref__
property
#
list of weak references to the object
__getitem__(key)
method descriptor
#
DatasetEvent
#
Bases: builtins.dict
This class defines the structure for an event within a dataset.
__annotations__ = {'type': ForwardRef('str', module='lit.sdk.data.datasets'), 'detail': ForwardRef('str', module='lit.sdk.data.datasets'), 'timestamp': ForwardRef('float', module='lit.sdk.data.datasets'), 'username': ForwardRef('str', module='lit.sdk.data.datasets')}
class
#
dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)
__doc__ = '\n This class defines the structure for an event within a dataset.\n '
class
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__module__ = 'lit.sdk.data.datasets'
class
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__optional_keys__ = frozenset()
class
#
frozenset() -> empty frozenset object frozenset(iterable) -> frozenset object
Build an immutable unordered collection of unique elements.
__orig_bases__ = (<function TypedDict at 0x7fef89f62e80>,)
class
#
Built-in immutable sequence.
If no argument is given, the constructor returns an empty tuple. If iterable is specified the tuple is initialized from iterable's items.
If the argument is a tuple, the return value is the same object.
__required_keys__ = frozenset({'detail', 'username', 'type', 'timestamp'})
class
#
frozenset() -> empty frozenset object frozenset(iterable) -> frozenset object
Build an immutable unordered collection of unique elements.
__total__ = True
class
#
bool(x) -> bool
Returns True when the argument x is true, False otherwise. The builtins True and False are the only two instances of the class bool. The class bool is a subclass of the class int, and cannot be subclassed.
__weakref__
property
#
list of weak references to the object
LitDataError
#
Bases: lit.sdk.errors.LitError
A custom exception class for Lit data errors.
__annotations__ = {}
class
#
dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)
__doc__ = 'A custom exception class for Lit data errors.'
class
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__module__ = 'lit.sdk.errors'
class
#
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
__init__(*args, code=None)
method descriptor
#
Initializes a new instance of LitDataError.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| *args | object | Variable number of arguments to pass to the base class constructor. | required |
| code | int \| None | The error code. Defaults to None. | None |
demo(team_name, name, feature_path, index, params)
method descriptor
#
Runs a feature demonstration on the specified dataset and returns the result.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| team_name | str | The name of the team. | required |
| name | str | The name of the dataset. | required |
| feature_path | str | The path to the feature to test. | required |
| index | int | The data index within the dataset to use for the demo. | required |
| params | dict | A set of parameters to pass to the feature script. | required |
Returns:
| Type | Description |
|---|---|
| dict | The result of the feature demonstration; the timestamp, return data from the feature, and any UI hints. |
Examples:
>>> demo(
... "contoso",
... "my_ds",
... "/data/contoso/features/ohlcv.py",
... 19562810,
... {"count": 5, "size": 1, "unit": "hour"},
... )
{'timestamp': 1493994825045691315,
'data': array([[2.38490005e+02, 2.38559998e+02, 2.33226593e+02, 2.38500000e+02,
3.71410000e+04, 8.73224400e+06, 2.35110626e+02],
[2.38500000e+02, 2.38660004e+02, 2.38300003e+02, 2.38520004e+02,
2.33910000e+04, 4.86324900e+06, 2.07911118e+02],
[2.38528900e+02, 2.38770004e+02, 2.38210007e+02, 2.38500000e+02,
2.87640000e+04, 6.10002200e+06, 2.12071411e+02],
[2.38500000e+02, 2.38798996e+02, 2.38399994e+02, 2.38740005e+02,
3.50160000e+04, 7.89611100e+06, 2.25500092e+02],
[2.39190002e+02, 2.39309998e+02, 2.38839996e+02, 2.38860001e+02,
2.14480000e+04, 4.43271800e+06, 2.06672791e+02]]),
'hints': {}}
estimate(team_name, name, feature_path, count, params)
method descriptor
#
Estimates feature data for a specified dataset.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| team_name | str | The name of the team. | required |
| name | str | The name of the dataset. | required |
| feature_path | str | The path to the feature script. | required |
| count | int | The number of samples to estimate. | required |
| params | dict | A set of parameters to pass to the feature script. | required |
Returns:
| Type | Description |
|---|---|
| ndarray | The estimated feature data as a NumPy array. |
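Examples:
>>> estimate(
... "contoso",
... "spy",
... "/data/contoso/features/ohlcv.py",
... 5,
... {"count": 5, "size": 1, "unit": "hour"},
... )
array([2.43907004e+02, 2.44300000e+02, 2.43257996e+02, 2.43632999e+02,
7.06504000e+04, 1.72431610e+07, 2.39403308e+02])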
lit_error_handler(error_type=LitError)
method descriptor
#
A decorator function that catches and re-raises exceptions with a custom error type.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| error_type | Type[LitError] | The type of exception to raise. Defaults to LitError. | LitError |
Returns:
| Type | Description |
|---|---|
| Callable | A wrapper function that catches exceptions and raises the specified error type. |
Examples:
>>> @lit_error_handler()
... def raises_error():
... assert False
...
>>> raises_error()
Traceback (most recent call last):
File "/opt/lit/src/lit/sdk/errors.py", line 130, in wrapper
File "<stdin>", line 3, in raises_error
AssertionError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/lit/src/lit/sdk/errors.py", line 132, in wrapper
lit.sdk.errors.LitError: An error occurred in function 'raises_error'