Source code for pypam.dataset

__author__ = "Clea Parcerisas"
__version__ = "0.1"
__credits__ = "Clea Parcerisas"
__email__ = "clea.parcerisas@vliz.be"
__status__ = "Development"

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray
from tqdm import tqdm

from pypam import acoustic_survey
from pypam import utils


[docs]class DataSet: """A DataSet object is a representation of a group of acoustic deployments. It allows to calculate all the acoustic features from all the deployments and store them in a structured way in the output folder. The structure is as follows: - output_folder - output_folder/deployments: a `*.nc` (netCDF) file for each deployment - output_folder/detections: files resulting of the events detections. One folder per detection type - output_folder/img: graphs and figures - output_folder/img/temporal_features : temporal evolution of all features per deployment - output_folder/img/data_overview : spatial and temporal coverage, and methods used - output_folder/img/features_analysis : still to be discussed - output_folder/img/spatial_features : spatial distribution of features Parameters ---------- summary_path : string or Path Path to the csv file where all the metadata of the deployments is output_folder : string or Path Where to save the output files (`*.nc`) of the deployments with the processed data instruments : dictionary of (name, instrument_object) entries A dictionary of all the instruments used in the deployments temporal_features : list of strings A list of all the features to be calculated bands_list : list of tuples A list of all the bands to consider (low_freq, high_freq) frequency_features : list of strings List of all the frequency features to compute binsize : float In seconds, duration of windows to consider nfft : int Number of samples of window to use for frequency analysis """ def __init__(self, summary_path, output_folder, instruments, temporal_features=None, frequency_features=None, bands_list=None, binsize=60.0, bin_overlap=0.0, nfft=512, fft_overlap=0, dc_subtract=False): self.metadata = pd.read_csv(summary_path) if 'end_to_end_calibration' not in self.metadata.columns: self.metadata['end_to_end_calibration'] = np.nan self.summary_path = summary_path self.instruments = instruments self.temporal_features = temporal_features self.frequency_features = frequency_features self.band_list = bands_list self.binsize = binsize self.bin_overlap = bin_overlap self.nfft = nfft self.fft_overlap = fft_overlap self.dc_subtract = dc_subtract if not isinstance(output_folder, pathlib.Path): output_folder = pathlib.Path(output_folder) self.output_folder = output_folder self.output_folder.joinpath('img/temporal_features').mkdir(parents=True, exist_ok=True) self.output_folder.joinpath('img/data_overview').mkdir(parents=True, exist_ok=True) self.output_folder.joinpath('img/features_analysis').mkdir(parents=True, exist_ok=True) self.output_folder.joinpath('img/spatial_features').mkdir(parents=True, exist_ok=True) self.output_folder.joinpath('deployments').mkdir(parents=True, exist_ok=True) self.output_folder.joinpath('detections').mkdir(parents=True, exist_ok=True) self.deployments_created = list(self.output_folder.joinpath('deployments').glob("*.nc")) self.dataset = {} self.survey_dependent_attrs = ['hydrophone_name', 'hydrophone_model', 'hydrophone_Vpp', 'hydrophone_preamp_gain', 'hydrophone_sensitivity'] def __call__(self): """ Calculates the acoustic features of every deployment and saves them as a pickle in the deployments folder with the name of the station of the deployment. Also adds all the deployment data to the self object in the general dataset, and the path to each deployment's pickle in the list of deployments """ for idx, name, deployment_path in self.deployments(): self[idx] self.metadata.to_csv(self.summary_path, index=False) def __getitem__(self, i): if i not in self.dataset.keys(): idx, deployment_name, deployment_path = self._deployment(i) if deployment_path.exists(): deployment = xarray.open_dataset(deployment_path) else: deployment = self.generate_deployment(idx=idx) deployment.to_netcdf(deployment_path) self.dataset[idx] = deployment else: deployment = self.dataset[i] return deployment def _deployment(self, idx): deployment_row = self.metadata.iloc[idx] deployment_path = self.output_folder.joinpath('deployments/%s_%s.nc' % (idx, deployment_row.deployment_name)) return idx, deployment_row['deployment_name'], deployment_path def join_dataset(self): ds = xarray.Dataset() for idx, name, deployment_path, in self.deployments(): deployment = self[idx] ds = utils.merge_ds(ds, deployment, self.survey_dependent_attrs) return ds def deployments(self): """ Iterates through all the deployments listed in the metadata file Returns ------- idx, deployment_name, deployment_path per iteration """ print('Reading dataset...') for idx in tqdm(self.metadata.index, total=len(self.metadata)): yield self._deployment(idx) def generate_deployment(self, idx): """ Generate the deployment dataset for the index idx in the metadata file Parameters ---------- idx: int Index of the deployment in the dataset Returns ------- ds: xarray Dataset """ hydrophone = self.instruments[self.metadata.loc[(idx, 'instrument_name')]] hydrophone.sensitivity = self.metadata.loc[(idx, 'instrument_sensitivity')] hydrophone.preamp_gain = self.metadata.loc[(idx, 'instrument_amp')] hydrophone.Vpp = self.metadata.loc[(idx, 'instrument_Vpp')] survey_columns = ['folder_path', 'timezone', 'include_dirs', 'calibration'] extra_attrs = self.metadata.loc[(idx, self.metadata.columns[9::])].to_dict() asa = acoustic_survey.ASA(hydrophone, dc_subtract=self.dc_subtract, binsize=self.binsize, nfft=self.nfft, fft_overlap=self.fft_overlap, extra_attrs=extra_attrs, **self.metadata.loc[(idx, survey_columns)].to_dict()) ds = xarray.Dataset() if self.frequency_features not in [[], None]: for f in self.frequency_features: freq_evo = asa.evolution_freq_dom(f, band=None, db=True) for data_var in freq_evo.data_vars: ds = ds.merge(freq_evo[data_var]) if self.temporal_features not in [[], None]: temporal_evo = asa.evolution_multiple(method_list=self.temporal_features, band_list=self.band_list) for f in self.temporal_features: ds = ds.merge(temporal_evo[f]) # Update the metadata in case the calibration changed the sensitivity self.metadata.loc[idx, 'end_to_end_calibration'] = hydrophone.end_to_end_calibration(p_ref=1.0) self.metadata.to_csv(self.summary_path, index=False) return ds def add_deployment_metadata(self, idx): deployment_row = self.metadata.iloc[idx] hydrophone = self.instruments[deployment_row['instrument_name']] hydrophone.sensitivity = deployment_row['instrument_sensitivity'] hydrophone.preamp_gain = deployment_row['instrument_amp'] hydrophone.Vpp = deployment_row['instrument_Vpp'] asa = acoustic_survey.ASA(hydrophone=hydrophone, folder_path=deployment_row['folder_path']) start, end = asa.start_end_timestamp() duration = asa.duration() self.metadata.iloc[idx, ['start', 'end', 'duration']] = start, end, duration def add_temporal_metadata(self): """ Return a db with a data overview of the folder """ metadata_params = ['start_datetime', 'end_datetime', 'duration'] for m_p in metadata_params: self.metadata[m_p] = None for idx, _, _, in self.deployments(): self.add_deployment_metadata(idx) return self.metadata def plot_all_features_evo(self): """ Creates the images of the temporal evolution of all the features and saves them in the correspondent folder """ i = 0 for station_name, deployment in self.dataset.items(): for feature in self.temporal_features: deployment[feature].plot() plt.title('%s %s evolution' % (station_name, feature)) plt.savefig( self.output_folder.joinpath('img/temporal_features/%s_%s_%s.png' % (i, station_name, feature))) plt.show() i += 1 def plot_third_octave_bands_prob(self, h=1.0, percentiles=None): """ Create a plot with the probability distribution of the levels of the third octave bands Parameters ---------- h: float Histogram bin size (in db) percentiles: list of floats Percentiles to plot (0 to 1). Default is 10, 50 and 90% ([0.1, 0.5, 0.9]) """ if percentiles is None: percentiles = [] percentiles = np.array(percentiles) bin_edges = np.arange(start=self.dataset['oct3'].min().min(), stop=self.dataset['oct3'].max().max(), step=h) fbands = self.dataset['oct3'].columns station_i = 0 for station_name, deployment in self.dataset.items(): sxx = deployment['oct3'].values.T spd = np.zeros((sxx.shape[0], bin_edges.size - 1)) p = np.zeros((sxx.shape[0], percentiles.size)) for i in np.arange(sxx.shape[0]): spd[i, :] = np.histogram(sxx[i, :], bin_edges, density=True)[0] cumsum = np.cumsum(spd[i, :]) for j in np.arange(percentiles.size): p[i, j] = bin_edges[np.argmax(cumsum > percentiles[j] * cumsum[-1])] fig = plt.figure() im = plt.pcolormesh(fbands, bin_edges[:-1], spd.T, cmap='BuPu', shading='auto') # Plot the lines of the percentiles plt.plot(fbands, p, label=percentiles) plt.xlabel('Frequency [Hz]') plt.xscale('log') plt.ylabel('$L_{rms}$ [dB]') cbar = fig.colorbar(im) cbar.set_label('Empirical Probability Density', rotation=90) plt.title('1/3-octave bands probability distribution %s' % station_name) plt.savefig(self.output_folder.joinpath('img/features_analysis/%s_%s_third_oct_prob.png' % (station_i, station_name))) plt.show() station_i += 1 def plot_third_octave_bands_avg(self): """ Plot the average third octave bands per deployment """ station_i = 0 for station_name, deployment in self.dataset.items(): deployment['oct3'].mean(axis=0).plot() plt.title('1/3-octave bands average %s' % station_name) plt.xlabel('Frequency [Hz]') plt.xscale('log') plt.ylabel(r'Average Sound Level [dB re 1 $\mu Pa$]') plt.savefig(self.output_folder.joinpath('img/features_analysis/%s_%s_third_oct_avg.png' % (station_i, station_name))) plt.show() station_i += 1