__author__ = "Clea Parcerisas"
__version__ = "0.1"
__credits__ = "Clea Parcerisas"
__email__ = "clea.parcerisas@vliz.be"
__status__ = "Development"
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray
from tqdm import tqdm
from pypam import acoustic_survey
from pypam import utils
[docs]class DataSet:
"""A DataSet object is a representation of a group of acoustic deployments.
It allows to calculate all the acoustic features from all the deployments and store them in a structured way
in the output folder. The structure is as follows:
- output_folder
- output_folder/deployments: a `*.nc` (netCDF) file for each deployment
- output_folder/detections: files resulting of the events detections. One folder per detection type
- output_folder/img: graphs and figures
- output_folder/img/temporal_features : temporal evolution of all features per deployment
- output_folder/img/data_overview : spatial and temporal coverage, and methods used
- output_folder/img/features_analysis : still to be discussed
- output_folder/img/spatial_features : spatial distribution of features
Parameters
----------
summary_path : string or Path
Path to the csv file where all the metadata of the deployments is
output_folder : string or Path
Where to save the output files (`*.nc`) of the deployments with the processed data
instruments : dictionary of (name, instrument_object) entries
A dictionary of all the instruments used in the deployments
temporal_features : list of strings
A list of all the features to be calculated
bands_list : list of tuples
A list of all the bands to consider (low_freq, high_freq)
frequency_features : list of strings
List of all the frequency features to compute
binsize : float
In seconds, duration of windows to consider
nfft : int
Number of samples of window to use for frequency analysis
"""
def __init__(self, summary_path, output_folder, instruments, temporal_features=None, frequency_features=None,
bands_list=None, binsize=60.0, bin_overlap=0.0, nfft=512, fft_overlap=0, dc_subtract=False):
self.metadata = pd.read_csv(summary_path)
if 'end_to_end_calibration' not in self.metadata.columns:
self.metadata['end_to_end_calibration'] = np.nan
self.summary_path = summary_path
self.instruments = instruments
self.temporal_features = temporal_features
self.frequency_features = frequency_features
self.band_list = bands_list
self.binsize = binsize
self.bin_overlap = bin_overlap
self.nfft = nfft
self.fft_overlap = fft_overlap
self.dc_subtract = dc_subtract
if not isinstance(output_folder, pathlib.Path):
output_folder = pathlib.Path(output_folder)
self.output_folder = output_folder
self.output_folder.joinpath('img/temporal_features').mkdir(parents=True, exist_ok=True)
self.output_folder.joinpath('img/data_overview').mkdir(parents=True, exist_ok=True)
self.output_folder.joinpath('img/features_analysis').mkdir(parents=True, exist_ok=True)
self.output_folder.joinpath('img/spatial_features').mkdir(parents=True, exist_ok=True)
self.output_folder.joinpath('deployments').mkdir(parents=True, exist_ok=True)
self.output_folder.joinpath('detections').mkdir(parents=True, exist_ok=True)
self.deployments_created = list(self.output_folder.joinpath('deployments').glob("*.nc"))
self.dataset = {}
self.survey_dependent_attrs = ['hydrophone_name', 'hydrophone_model', 'hydrophone_Vpp',
'hydrophone_preamp_gain', 'hydrophone_sensitivity']
def __call__(self):
"""
Calculates the acoustic features of every deployment and saves them as a pickle in the deployments folder with
the name of the station of the deployment.
Also adds all the deployment data to the self object in the general dataset,
and the path to each deployment's pickle in the list of deployments
"""
for idx, name, deployment_path in self.deployments():
self[idx]
self.metadata.to_csv(self.summary_path, index=False)
def __getitem__(self, i):
if i not in self.dataset.keys():
idx, deployment_name, deployment_path = self._deployment(i)
if deployment_path.exists():
deployment = xarray.open_dataset(deployment_path)
else:
deployment = self.generate_deployment(idx=idx)
deployment.to_netcdf(deployment_path)
self.dataset[idx] = deployment
else:
deployment = self.dataset[i]
return deployment
def _deployment(self, idx):
deployment_row = self.metadata.iloc[idx]
deployment_path = self.output_folder.joinpath('deployments/%s_%s.nc' %
(idx, deployment_row.deployment_name))
return idx, deployment_row['deployment_name'], deployment_path
def join_dataset(self):
ds = xarray.Dataset()
for idx, name, deployment_path, in self.deployments():
deployment = self[idx]
ds = utils.merge_ds(ds, deployment, self.survey_dependent_attrs)
return ds
def deployments(self):
"""
Iterates through all the deployments listed in the metadata file
Returns
-------
idx, deployment_name, deployment_path per iteration
"""
print('Reading dataset...')
for idx in tqdm(self.metadata.index, total=len(self.metadata)):
yield self._deployment(idx)
def generate_deployment(self, idx):
"""
Generate the deployment dataset for the index idx in the metadata file
Parameters
----------
idx: int
Index of the deployment in the dataset
Returns
-------
ds: xarray Dataset
"""
hydrophone = self.instruments[self.metadata.loc[(idx, 'instrument_name')]]
hydrophone.sensitivity = self.metadata.loc[(idx, 'instrument_sensitivity')]
hydrophone.preamp_gain = self.metadata.loc[(idx, 'instrument_amp')]
hydrophone.Vpp = self.metadata.loc[(idx, 'instrument_Vpp')]
survey_columns = ['folder_path', 'timezone', 'include_dirs', 'calibration']
extra_attrs = self.metadata.loc[(idx, self.metadata.columns[9::])].to_dict()
asa = acoustic_survey.ASA(hydrophone,
dc_subtract=self.dc_subtract,
binsize=self.binsize,
nfft=self.nfft,
fft_overlap=self.fft_overlap,
extra_attrs=extra_attrs,
**self.metadata.loc[(idx, survey_columns)].to_dict())
ds = xarray.Dataset()
if self.frequency_features not in [[], None]:
for f in self.frequency_features:
freq_evo = asa.evolution_freq_dom(f, band=None, db=True)
for data_var in freq_evo.data_vars:
ds = ds.merge(freq_evo[data_var])
if self.temporal_features not in [[], None]:
temporal_evo = asa.evolution_multiple(method_list=self.temporal_features, band_list=self.band_list)
for f in self.temporal_features:
ds = ds.merge(temporal_evo[f])
# Update the metadata in case the calibration changed the sensitivity
self.metadata.loc[idx, 'end_to_end_calibration'] = hydrophone.end_to_end_calibration(p_ref=1.0)
self.metadata.to_csv(self.summary_path, index=False)
return ds
def add_deployment_metadata(self, idx):
deployment_row = self.metadata.iloc[idx]
hydrophone = self.instruments[deployment_row['instrument_name']]
hydrophone.sensitivity = deployment_row['instrument_sensitivity']
hydrophone.preamp_gain = deployment_row['instrument_amp']
hydrophone.Vpp = deployment_row['instrument_Vpp']
asa = acoustic_survey.ASA(hydrophone=hydrophone, folder_path=deployment_row['folder_path'])
start, end = asa.start_end_timestamp()
duration = asa.duration()
self.metadata.iloc[idx, ['start', 'end', 'duration']] = start, end, duration
def add_temporal_metadata(self):
"""
Return a db with a data overview of the folder
"""
metadata_params = ['start_datetime', 'end_datetime', 'duration']
for m_p in metadata_params:
self.metadata[m_p] = None
for idx, _, _, in self.deployments():
self.add_deployment_metadata(idx)
return self.metadata
def plot_all_features_evo(self):
"""
Creates the images of the temporal evolution of all the features and saves them in the correspondent folder
"""
i = 0
for station_name, deployment in self.dataset.items():
for feature in self.temporal_features:
deployment[feature].plot()
plt.title('%s %s evolution' % (station_name, feature))
plt.savefig(
self.output_folder.joinpath('img/temporal_features/%s_%s_%s.png' % (i, station_name, feature)))
plt.show()
i += 1
def plot_third_octave_bands_prob(self, h=1.0, percentiles=None):
"""
Create a plot with the probability distribution of the levels of the third octave bands
Parameters
----------
h: float
Histogram bin size (in db)
percentiles: list of floats
Percentiles to plot (0 to 1). Default is 10, 50 and 90% ([0.1, 0.5, 0.9])
"""
if percentiles is None:
percentiles = []
percentiles = np.array(percentiles)
bin_edges = np.arange(start=self.dataset['oct3'].min().min(), stop=self.dataset['oct3'].max().max(), step=h)
fbands = self.dataset['oct3'].columns
station_i = 0
for station_name, deployment in self.dataset.items():
sxx = deployment['oct3'].values.T
spd = np.zeros((sxx.shape[0], bin_edges.size - 1))
p = np.zeros((sxx.shape[0], percentiles.size))
for i in np.arange(sxx.shape[0]):
spd[i, :] = np.histogram(sxx[i, :], bin_edges, density=True)[0]
cumsum = np.cumsum(spd[i, :])
for j in np.arange(percentiles.size):
p[i, j] = bin_edges[np.argmax(cumsum > percentiles[j] * cumsum[-1])]
fig = plt.figure()
im = plt.pcolormesh(fbands, bin_edges[:-1], spd.T, cmap='BuPu', shading='auto')
# Plot the lines of the percentiles
plt.plot(fbands, p, label=percentiles)
plt.xlabel('Frequency [Hz]')
plt.xscale('log')
plt.ylabel('$L_{rms}$ [dB]')
cbar = fig.colorbar(im)
cbar.set_label('Empirical Probability Density', rotation=90)
plt.title('1/3-octave bands probability distribution %s' % station_name)
plt.savefig(self.output_folder.joinpath('img/features_analysis/%s_%s_third_oct_prob.png' %
(station_i, station_name)))
plt.show()
station_i += 1
def plot_third_octave_bands_avg(self):
"""
Plot the average third octave bands per deployment
"""
station_i = 0
for station_name, deployment in self.dataset.items():
deployment['oct3'].mean(axis=0).plot()
plt.title('1/3-octave bands average %s' % station_name)
plt.xlabel('Frequency [Hz]')
plt.xscale('log')
plt.ylabel(r'Average Sound Level [dB re 1 $\mu Pa$]')
plt.savefig(self.output_folder.joinpath('img/features_analysis/%s_%s_third_oct_avg.png' %
(station_i, station_name)))
plt.show()
station_i += 1