Metadata
Qubed includes the ability to store metadata which may vary for each individual leaf node. This is achieved by ‘hanging’ arrays at various points in the tree all the way down to the leaf nodes.
# Load an example qube that carries metadata from its JSON file.
from qubed import Qube
example = Qube.load("../tests/example_qubes/extremes-dt_with_metadata.json")
# Render the qube as an HTML tree, passing depth=1.
example.html(depth=1)
root, class=d1, dataset=extremes-dt
├── date=2024-04-04T00:00/2024-04-05T00:00/2024-04-07T..., expver=0001, stream=oper, time=0000
│   ├── levtype=hl, type=fc, levelist=100, param=228246/228247, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   ├── levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   └── levtype=sfc, type=fc
│       ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│       ├── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
│       └── param=228058, step=0-6/12-18/18-24/24-30/30-36/36-42/42-48/48-54...
├── date=2024-04-06T00:00, expver=0001, stream=oper, time=0000
│   ├── levtype=hl, type=fc, levelist=100, param=228246/228247, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   ├── levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   └── levtype=sfc, type=fc
│       ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│       ├── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
│       └── param=228058, step=0-6/12-18/18-24/24-30/30-36/36-42/42-48/48-54...
├── date=2024-04-13T00:00, expver=0001, stream=oper, time=0000
│   ├── levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   └── levtype=sfc, type=fc
│       ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│       ├── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
│       └── param=228058, step=0-6/12-18/18-24/24-30/30-36/36-42/42-48/48-54...
├── date=2024-05-02T00:00, expver=0001, stream=oper, time=0000, levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
├── date=2024-04-22T00:00, expver=0001, stream=oper, time=0000
│   ├── levtype=hl, type=fc, levelist=100, param=228246/228247, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   ├── levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   └── levtype=sfc, type=fc
│       ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│       └── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
├── date=2024-05-19T00:00, expver=0001, stream=oper, time=0000
│   ├── levtype=hl, type=fc, levelist=100, param=228246/228247, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   ├── levtype=pl, type=fc, levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│   └── levtype=sfc, type=fc
│       ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
│       ├── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
│       └── param=228058, step=0-6/12-18/18-24/24-30/30-36/36-42/42-48/48-54...
└── date=2024-05-29T00:00, expver=0001, stream=oper, time=0000
    ├── levtype=hl, type=fc, levelist=100, param=228246/228247, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
    ├── levtype=pl, type=fc
    │   ├── levelist=1/10/100/1000/150/2/20/200/250/3/30/300/4..., param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
    │   ├── levelist=5/7/70/700/850/925, param=129/130/131/132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
    │   └── levelist=50
    │       ├── param=129/130/131, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
    │       └── param=132/133/157, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
    └── levtype=sfc, type=fc
        ├── param=31/34/78/134/136/137/151/165/166/167/168/302..., step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
        ├── param=142/144/169/175/176/177/178/179/180/181/205/..., step=0-1/1-2/10-11/11-12/12-13/13-14/14-15/15-16/1...
        └── param=228058, step=0-6/12-18/18-24/24-30/30-36/36-42/42-48/48-54...
When metadata is present, info prints information about the metadata also:
example.info()
This qube has
145 nodes
593.0 thousand individual leaves
In memory size of qube: 331.6 kB
--- Axes Info -------
| | Key | Dtypes | Depths | Example Value |
|---:|:---------|:---------|:---------|:--------------------|
| 0 | step | str | 10/11 | 59-60 |
| 1 | param | int64 | 9/10 | 129 |
| 2 | levelist | str | 9 | 300 |
| 3 | type | str | 8 | fc |
| 4 | levtype | str | 7 | pl |
| 5 | time | str | 6 | 0000 |
| 6 | stream | str | 5 | oper |
| 7 | expver | str | 4 | 0001 |
| 8 | date | datetime | 3 | 2024-04-28 00:00:00 |
| 9 | dataset | str | 2 | extremes-dt |
| 10 | class | str | 1 | d1 |
--- Metadata Info ---
| | Key | Dtypes | Depths | Total Size |
|---:|:-------|:--------------|:------------|:-------------|
| 0 | port | uint16 | 1 | 2 Bytes |
| 1 | scheme | StringDType() | 1 | 16 Bytes |
| 2 | offset | uint32/uint64 | 10/11 | 4.7 MB |
| 3 | length | uint32 | 10/11 | 2.4 MB |
| 4 | host | StringDType() | 3/7/9/10/11 | 1.2 MB |
| 5 | path | StringDType() | 3/7/9/10/11 | 1.2 MB |
Hovering over nodes will give some debug information about them and what metadata is attached. We can iterate over leaf nodes including their metadata using Qube.leaves(metadata=True):
next(example.leaves(metadata=True))
({'class': 'd1',
'dataset': 'extremes-dt',
'date': datetime.datetime(2024, 4, 4, 0, 0),
'expver': '0001',
'stream': 'oper',
'time': '0000',
'levtype': 'hl',
'type': 'fc',
'levelist': '100',
'param': 228246,
'step': '0'},
{'offset': 0,
'length': 57635255,
'host': 'databridge-prod-store2-ope.ewctest.link',
'path': '/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data',
'port': 10000,
'scheme': 'fdb'})
In this case we see that each individual field of this Qube stores a path to a file and an offset and length into that file. The path string is actually stored one level up the tree because it is common to many individual leaves.
We can print some helpful information about how the metadata appears in the qube:
example.metadata_info()
{'port': MetadataInfo(key='port', dtypes={dtype('uint16')}, depths={1}, total_bytes=2),
'scheme': MetadataInfo(key='scheme', dtypes={StringDType()}, depths={1}, total_bytes=16),
'offset': MetadataInfo(key='offset', dtypes={dtype('uint32'), dtype('uint64')}, depths={10, 11}, total_bytes=4741344),
'length': MetadataInfo(key='length', dtypes={dtype('uint32')}, depths={10, 11}, total_bytes=2371856),
'host': MetadataInfo(key='host', dtypes={StringDType()}, depths={3, 7, 9, 10, 11}, total_bytes=1155232),
'path': MetadataInfo(key='path', dtypes={StringDType()}, depths={3, 7, 9, 10, 11}, total_bytes=1155232)}
Building qubes with metadata
Currently the main way to build qubes with metadata is leaf by leaf using union like this:
# Rebuild a qube leaf by leaf, carrying each leaf's metadata across.
new_qube = Qube.empty()
for i, (identifier, metadata) in enumerate(example.leaves(metadata=True)):
    # Turn the leaf into a one-leaf qube, attach its metadata, and merge it
    # into the accumulated result with the union operator.
    new_qube |= Qube.from_datacube(identifier).add_metadata(metadata)
    # Stop once i exceeds 100 to keep the example small.
    if i > 100:
        break
new_qube
root, class=d1, dataset=extremes-dt, date=2024-04-04T00:00, expver=0001, stream=oper, time=0000, levtype=hl, type=fc, levelist=100
├── param=228246, step=0/1/10/11/12/13/14/15/16/17/18/19/2/20/21/22/...
└── param=228247, step=0/1/10/11/12
Here I’ve used an existing qube as a convenient source of metadata, but you can equally do this from the output of an fdb list.
Modifying Qubes with metadata
For subselection, Qube.select works on qubes with metadata and will correctly slice the metadata along with the qube. To update existing metadata you can do something like:
# The qube carrying the new metadata goes on the left-hand side of the union.
existing_qube = Qube.from_datacube(target).add_metadata(metadata) | existing_qube
This will update the metadata for target to metadata because in the union the leftmost qube takes precedence. (Perhaps this should be changed to rightmost precedence!)
Recipes
Extracting the set of metadata values
In the case of metadata which sits at levels above the leaf nodes it would be inefficient to use Qube.leaves; instead one can use Qube.walk like this:
def get_metadata_key(qube, key):
    """
    Collect every value stored under the metadata key `key` anywhere in `qube`.

    The qube is traversed with `qube.walk`; for each visited node whose
    metadata contains `key`, the flattened array of values is appended to the
    result.

    Returns a flat list of values, in the order the nodes are visited.
    """
    values = []

    def collect(node):
        # Direct lookup instead of scanning every metadata item of the node.
        if key in node.metadata:
            values.extend(node.metadata[key].flatten())

    qube.walk(collect)
    return values
# Gather every "path" value from the example qube and show the first five.
m = get_metadata_key(example, "path")
m[:5]
['/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data',
'/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data',
'/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data',
'/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data',
'/data/prod_2/fdb/d1:extremes-dt:0001:oper:20240404:0000/fc:hl.20240501.115251.databridge-prod-store2.novalocal.10625568701677568.data']
Getting the total size in bytes used by metadata
from collections import defaultdict
def count_metadata_bytes(q: Qube):
    """
    Sum the `nbytes` of every metadata array in the qube, grouped by metadata key.

    The qube is traversed with `q.walk`, and each visited node contributes the
    `nbytes` of each of its metadata arrays to the running total for that key.

    Returns a plain dict mapping each metadata key to its total size in bytes.
    """
    totals = defaultdict(int)

    def measure(node: Qube):
        for key, values in node.metadata.items():
            totals[key] += values.nbytes

    q.walk(measure)
    return dict(totals)
# Requires the humanize library for nice formatting of bytes
def print_metadata_sizes(q):
    """Print one `key : size` line per metadata key, with the size formatted by humanize."""
    # Imported locally so the recipe is self-contained; humanize is third-party.
    import humanize

    for key, size in count_metadata_bytes(q).items():
        print(f"{key} : {humanize.naturalsize(size)}")
count_metadata_bytes(example)
{'port': 2,
'scheme': 16,
'offset': 4741344,
'length': 2371856,
'host': 1155232,
'path': 1155232}
Conversions
def convert_metadata_strings(q):
    """
    Convert any metadata string arrays from the old style numpy fixed width U<N format to the new style StringDType format.

    Arrays of any other dtype are passed through unchanged.

    Returns the result of `q.replace(metadata=...)` with the converted arrays.
    """
    import numpy as np  # local import keeps the recipe self-contained

    def as_string_dtype(values):
        # Fixed-width unicode arrays report np.str_ as their scalar type.
        if values.dtype.type is np.str_:
            return values.astype(np.dtypes.StringDType())
        return values

    return q.replace(
        metadata={key: as_string_dtype(values) for key, values in q.metadata.items()}
    )
def choose_smallest_int_container(qube):
    """
    Go through all the int metadata blobs and compactify them to use the smallest container that will fit them without losing information.

    Both the minimum and the maximum of each array are taken into account, so
    arrays holding negative values keep a signed dtype. Empty arrays, and
    arrays for which no single integer dtype fits both extremes, are left
    untouched.

    Returns `qube` itself when nothing changes, otherwise the result of
    `qube.replace(metadata=...)` with the compacted arrays merged in.
    """
    import numpy as np  # local import keeps the recipe self-contained

    replace = {}
    for k, v in qube.metadata.items():
        # Skip non-integer arrays, and empty ones (np.min/np.max would raise).
        if not np.issubdtype(v.dtype, np.integer) or v.size == 0:
            continue

        # Smallest dtype able to hold both extremes, not just the maximum.
        new_dtype = np.promote_types(
            np.min_scalar_type(np.min(v)), np.min_scalar_type(np.max(v))
        )
        # Promotion of a signed and a large unsigned type can yield a float;
        # in that case keep the array's current integer dtype.
        if not np.issubdtype(new_dtype, np.integer):
            continue

        if new_dtype != v.dtype:
            replace[k] = v.astype(new_dtype)

    if not replace:
        return qube
    return qube.replace(metadata=qube.metadata | replace)
# Usage: pass each converter to Qube.transform and rebind q to the returned qube.
q = q.transform(convert_metadata_strings)
q = q.transform(choose_smallest_int_container)