Managing Datasets¶
Datasets organize multiple signals together, representing a complete data collection (like all sensors from a treatment plant).
Creating a Dataset¶
import numpy as np
import pandas as pd
from meteaudata import Signal, DataProvenance, Dataset
from meteaudata import resample, linear_interpolation, subset, replace_ranges
from meteaudata import average_signals
# Seed NumPy's global generator so the noise below is identical on every run
np.random.seed(42)

# One shared hourly time axis: 100 samples starting 2024-01-01 00:00
timestamps = pd.date_range('2024-01-01', periods=100, freq='h')
sample_index = np.arange(100)

# Temperature: sine wave with a 24-sample (daily) period around 20, plus noise.
# The noise is drawn first, but the draw order across the three series
# (temperature, pH, dissolved oxygen) is what fixes the seeded values.
temp_noise = np.random.normal(0, 0.5, 100)
temp_data = pd.Series(
    data=20 + 5 * np.sin(sample_index * 2 * np.pi / 24) + temp_noise,
    index=timestamps,
    name="RAW",
)

# pH: slower sine wave (48-sample period) around 7.2, plus smaller noise
ph_noise = np.random.normal(0, 0.1, 100)
ph_data = pd.Series(
    data=7.2 + 0.3 * np.sin(sample_index * 2 * np.pi / 48) + ph_noise,
    index=timestamps,
    name="RAW",
)

# Dissolved oxygen: falls as temperature rises above 20, plus its own noise
do_noise = np.random.normal(0, 0.2, 100)
do_data = pd.Series(
    data=8.5 - 0.1 * (temp_data - 20) + do_noise,
    index=timestamps,
    name="RAW",
)
# Provenance fields that are the same for every sensor in this example
common_provenance = {
    "source_repository": "Plant SCADA",
    "project": "Multi-parameter Monitoring",
    "location": "Reactor R-101",
    "purpose": "Process monitoring",
}

# Temperature signal
temp_provenance = DataProvenance(
    equipment="Thermocouple Type K",
    parameter="Temperature",
    metadata_id="temp_001",
    **common_provenance,
)
temperature_signal = Signal(
    input_data=temp_data,
    name="Temperature",
    provenance=temp_provenance,
    units="°C",
)

# pH signal
ph_provenance = DataProvenance(
    equipment="pH Sensor v1.3",
    parameter="pH",
    metadata_id="ph_001",
    **common_provenance,
)
ph_signal = Signal(
    input_data=ph_data,
    name="pH",
    provenance=ph_provenance,
    units="pH units",
)

# Dissolved oxygen signal
do_provenance = DataProvenance(
    equipment="DO Sensor v2.0",
    parameter="Dissolved Oxygen",
    metadata_id="do_001",
    **common_provenance,
)
do_signal = Signal(
    input_data=do_data,
    name="DissolvedOxygen",
    provenance=do_provenance,
    units="mg/L",
)
# Keep the three signals in a plain dict so they can be reached by a short key
signals = {
    "temperature": temperature_signal,
    "ph": ph_signal,
    "dissolved_oxygen": do_signal,
}

# Assemble the dataset; it is handed its own copy of the mapping above
dataset = Dataset(
    name="reactor_monitoring",
    description="Multi-parameter monitoring of reactor R-101",
    owner="Process Engineer",
    purpose="Process control and optimization",
    project="Process Monitoring Study",
    signals=dict(signals),
)

# Summarize the dataset: its name, then one line per stored signal
print(f"Dataset: {dataset.name}")
print(f"Contains {len(dataset.signals)} signals:")
for name, signal in dataset.signals.items():
    print(f" - {name}: {signal.name} ({signal.units})")
Dataset: reactor_monitoring
Contains 3 signals:
- Temperature#1: Temperature#1 (°C)
- pH#1: pH#1 (pH units)
- DissolvedOxygen#1: DissolvedOxygen#1 (mg/L)
Accessing Signals¶
# Signals are looked up by the keys the dataset actually stores;
# list those keys and take the first one
signal_keys = list(dataset.signals)
temp_signal = dataset.signals[signal_keys[0]]
print(f"Temperature signal: {temp_signal.name}")
print(f"Time series: {list(temp_signal.time_series.keys())}")

# Fetch the raw time series by its full name, then read the data from `.series`
raw_ts = temp_signal.time_series["Temperature#1_RAW#1"]
temp_data = raw_ts.series
print(f"Temperature data points: {len(temp_data)}")
print(f"Sample values: {temp_data.head(3).values}")
Dataset Processing¶
# Process a single signal of the dataset. Only the temperature signal is
# processed here; the other signals keep just their raw time series.
from meteaudata import linear_interpolation

# Run linear_interpolation on the raw temperature time series
temp_signal.process(["Temperature#1_RAW#1"], linear_interpolation)
print("Processed temperature signal")
print(f"Temperature now has {len(temp_signal.time_series)} time series")

# List the time series that each signal in the dataset holds
print("Available time series:")
for signal_name, signal in dataset.signals.items():
    ts_names = list(signal.time_series.keys())
    print(f" {signal_name}: {ts_names}")
Available time series:
Temperature#1: ['Temperature#1_RAW#1', 'Temperature#1_LIN-INT#1']
pH#1: ['pH#1_RAW#1']
DissolvedOxygen#1: ['DissolvedOxygen#1_RAW#1']
Dataset Attributes¶
# Print the dataset's descriptive attributes, one per line
print(
    f"Dataset name: {dataset.name}",
    f"Description: {dataset.description}",
    f"Owner: {dataset.owner}",
    f"Project: {dataset.project}",
    f"Created: {dataset.created_on}",
    f"Signal count: {len(dataset.signals)}",
    sep="\n",
)
Dataset name: reactor_monitoring
Description: Multi-parameter monitoring of reactor R-101
Owner: Process Engineer
Project: Process Monitoring Study
Created: 2025-12-10 19:11:08.261559
Signal count: 3
Custom Output Naming (v0.10.0+)¶
When processing datasets, you can assign custom names to output signals and time series:
from meteaudata import average_signals

# Signals handed to the transform.
# NOTE(review): "Temperature#2" is not created by the earlier examples on this
# page; presumably a second temperature signal must already be in the
# dataset for this call to succeed. Confirm before running.
temperature_inputs = ["Temperature#1", "Temperature#2"]

# Name the output signal explicitly instead of relying on default naming
dataset.process(
    input_signal_names=temperature_inputs,
    transform_function=average_signals,
    output_signal_names=["TempAverage"],
)

# The new signal is stored as "TempAverage#1"
print(f"Created signal: {list(dataset.signals.keys())[-1]}")
Custom Time Series Names¶
You can also customize time series names within the output signals:
# Choose both the output signal name and the name of its time series.
# NOTE(review): "Temperature#2" is not created by the earlier examples on this
# page; presumably a second temperature signal must already be in the
# dataset for this call to succeed. Confirm before running.
site_inputs = ["Temperature#1", "Temperature#2"]
dataset.process(
    input_signal_names=site_inputs,
    transform_function=average_signals,
    output_signal_names=["SiteAverage"],
    output_ts_names=["hourly"],
)

# Result: signal "SiteAverage#1" holding time series "SiteAverage#1_hourly#1"
print(f"New signal: {list(dataset.signals.keys())[-1]}")
new_signal = dataset.signals["SiteAverage#1"]
print(f"Time series: {list(new_signal.time_series.keys())}")
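Note: Custom names cannot contain underscores, because the underscore is reserved as the separator between the signal name and the time series name (as in "SiteAverage#1_hourly#1"). Use hyphens or other characters instead.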
Overwrite Mode (v0.10.0+)¶
Pass overwrite=True to re-run dataset processing without creating a new version; the latest existing version of the output is replaced instead:
# The three calls below are identical except for the final overwrite flag;
# the trailing comments show the expected last signal key after each call.
# NOTE(review): `some_function` is a placeholder that is not defined in this
# example; substitute a real transform function before running.

# First run creates #1
dataset.process(
input_signal_names=["Temperature#1"],
transform_function=some_function,
output_signal_names=["Processed"]
)
print(f"First run: {list(dataset.signals.keys())[-1]}") # Processed#1
# Second run creates #2 by default
dataset.process(
input_signal_names=["Temperature#1"],
transform_function=some_function,
output_signal_names=["Processed"]
)
print(f"Second run: {list(dataset.signals.keys())[-1]}") # Processed#2
# With overwrite=True, replaces #2 (the latest) instead of creating #3
dataset.process(
input_signal_names=["Temperature#1"],
transform_function=some_function,
output_signal_names=["Processed"],
overwrite=True
)
print(f"With overwrite: {list(dataset.signals.keys())[-1]}") # Still Processed#2
See Also¶
- Working with Signals - Understanding individual signals
- Visualization - Plotting datasets and signals
- Saving and Loading - Persisting datasets