diff --git a/elephant/gpfa/gpfa.py b/elephant/gpfa/gpfa.py
index 94d3fb70d..de983cbe0 100644
--- a/elephant/gpfa/gpfa.py
+++ b/elephant/gpfa/gpfa.py
@@ -2,7 +2,7 @@
 Gaussian-process factor analysis (GPFA) is a dimensionality reduction method
 :cite:`gpfa-Yu2008_1881` for neural trajectory visualization of parallel spike
 trains. GPFA applies factor analysis (FA) to time-binned spike count data to
-reduce the dimensionality and at the same time smoothes the resulting
+reduce the dimensionality and at the same time smooth the resulting
 low-dimensional trajectories by fitting a Gaussian process (GP) model to them.
 
 The input consists of a set of trials (Y), each containing a list of spike
@@ -20,18 +20,18 @@
 
 Internally, the analysis consists of the following steps:
 
-0) bin the spike train data to get a sequence of N dimensional vectors of spike
-counts in respective time bins, and choose the reduced dimensionality x_dim
+#. bin the spike train data to get a sequence of N-dimensional vectors of
+   spike counts in respective time bins, and choose the reduced
+   dimensionality x_dim.
 
-1) expectation-maximization for fitting of the parameters C, d, R and the
-time-scales and variances of the Gaussian process, using all the trials
-provided as input (c.f., `gpfa_core.em()`)
+#. expectation-maximization for fitting of the parameters C, d, R and the
+   time-scales and variances of the Gaussian process, using all the trials
+   provided as input (cf. `gpfa_core.em()`).
 
-2) projection of single trials in the low dimensional space (c.f.,
-`gpfa_core.exact_inference_with_ll()`)
+#. projection of single trials in the low-dimensional space
+   (cf. `gpfa_core.exact_inference_with_ll()`).
 
-3) orthonormalization of the matrix C and the corresponding subspace, for
-visualization purposes: (c.f., `gpfa_core.orthonormalize()`)
+#. orthonormalization of the matrix C and the corresponding subspace, for
+   visualization purposes (cf. `gpfa_core.orthonormalize()`).
 
 
 .. autosummary::
@@ -54,7 +54,7 @@ Run tutorial interactively:
 
 .. image:: https://mybinder.org/badge.svg
-   :target: https://mybinder.org/v2/gh/NeuralEnsemble/elephant/master
+   :target: https://mybinder.org/v2/gh/NeuralEnsemble/elephant/master
             ?filepath=doc/tutorials/gpfa.ipynb
@@ -273,22 +273,32 @@ def binsize(self):
         warnings.warn("'binsize' is deprecated; use 'bin_size'")
         return self.bin_size
 
-    def fit(self, spiketrains):
+    def fit(self, trials):
         """
         Fit the model with the given training data.
 
         Parameters
         ----------
-        spiketrains : list of list of neo.SpikeTrain
+        trials : list of list of neo.SpikeTrain or np.recarray
             Spike train data to be fit to latent variables.
             The outer list corresponds to trials and the inner list
             corresponds to the neurons recorded in that trial, such that
-            `spiketrains[l][n]` is the spike train of neuron `n` in trial `l`.
+            `trials[l][n]` is the spike train of neuron `n` in trial `l`.
             Note that the number and order of `neo.SpikeTrain` objects per
-            trial must be fixed such that `spiketrains[l][n]` and
-            `spiketrains[k][n]` refer to spike trains of the same neuron
+            trial must be fixed such that `trials[l][n]` and
+            `trials[k][n]` refer to spike trains of the same neuron
             for any choices of `l`, `k`, and `n`.
+
+            Alternatively, pass continuous data pre-binned into a
+            np.recarray, whose n-th element (corresponding to the n-th
+            experimental trial) has the fields:
+
+            T : int
+                number of bins
+            y : (#units, T) np.ndarray
+                neural data
 
         Returns
         -------
         self : object
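For orientation, a minimal sketch (not part of the patch) of the training structure this new code path expects: two hypothetical trials of pre-binned counts, with the `T`/`y` fields named in the docstring above.

```python
import numpy as np

# Hypothetical pre-binned counts: two trials, 3 units, 20 and 25 bins.
rng = np.random.default_rng(0)
trial_counts = [rng.poisson(2.0, size=(3, 20)),
                rng.poisson(2.0, size=(3, 25))]

# One record per trial: 'T' is the number of bins, 'y' the (units, T) array.
seqs = np.array([(y.shape[1], y) for y in trial_counts],
                dtype=[('T', int), ('y', 'O')])

print(seqs['T'])           # [20 25]
print(seqs[0]['y'].shape)  # (3, 20)
```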
@@ -297,14 +307,22 @@ def fit(self, spiketrains):
         Raises
         ------
         ValueError
-            If `spiketrains` is an empty list.
+            If `trials` is an empty list.
 
-            If `spiketrains[0][0]` is not a `neo.SpikeTrain`.
+            If `trials[0][0]` is not a `neo.SpikeTrain`.
 
             If covariance matrix of input spike data is rank deficient.
         """
-        self._check_training_data(spiketrains)
-        seqs_train = self._format_training_data(spiketrains)
+        if isinstance(trials, list):
+            self._check_training_data(trials)
+            seqs_train = self._format_training_data(trials)
+        elif isinstance(trials, np.ndarray):
+            seqs_train = self._format_training_data_seqs(trials)
+        else:
+            raise ValueError("'trials' must be provided either as a "
+                             "np.recarray or as a list of lists of "
+                             "neo.SpikeTrain")
+
         # Check if training data covariance is full rank
         y_all = np.hstack(seqs_train['y'])
         y_dim = y_all.shape[0]
@@ -339,9 +357,9 @@
     @staticmethod
     def _check_training_data(spiketrains):
         if len(spiketrains) == 0:
-            raise ValueError("Input spiketrains cannot be empty")
+            raise ValueError("Input trials cannot be empty")
         if not isinstance(spiketrains[0][0], neo.SpikeTrain):
-            raise ValueError("structure of the spiketrains is not correct: "
-                             "0-axis should be trials, 1-axis neo.SpikeTrain"
-                             "and 2-axis spike times")
+            raise ValueError("structure of the trials is not correct: "
+                             "0-axis should be trials, 1-axis neo.SpikeTrain, "
+                             "and 2-axis spike times")
@@ -353,7 +371,14 @@ def _format_training_data(self, spiketrains):
             seq['y'] = seq['y'][self.has_spikes_bool, :]
         return seqs
 
-    def transform(self, spiketrains, returned_data=['latent_variable_orth']):
+    def _format_training_data_seqs(self, seqs):
+        # Remove inactive units based on the training set
+        self.has_spikes_bool = np.hstack(seqs['y']).any(axis=1)
+        for seq in seqs:
+            seq['y'] = seq['y'][self.has_spikes_bool, :]
+        return seqs
+
+    def transform(self, trials, returned_data=('latent_variable_orth',)):
         """
         Obtain trajectories of neural activity in a low-dimensional latent
         variable space by inferring the posterior mean of the obtained GPFA
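Aside: the inactive-unit filtering in the new `_format_training_data_seqs` mirrors the existing spike-train path — units with no counts anywhere in the training set are dropped. A self-contained sketch of that mask logic, on hypothetical data:

```python
import numpy as np

# Hypothetical seqs recarray with one silent unit (row of zeros).
y0 = np.array([[1, 0, 2], [0, 0, 0], [3, 1, 0]])
seqs = np.array([(y0.shape[1], y0)], dtype=[('T', int), ('y', 'O')])

# A unit is kept if it has any nonzero count in the concatenated data.
has_spikes_bool = np.hstack(seqs['y']).any(axis=1)
print(has_spikes_bool)  # [ True False  True]

for seq in seqs:
    seq['y'] = seq['y'][has_spikes_bool, :]
print(seqs[0]['y'].shape)  # (2, 3) -- silent unit removed
```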
@@ -361,15 +386,26 @@
 
         Parameters
         ----------
-        spiketrains : list of list of neo.SpikeTrain
+        trials : list of list of neo.SpikeTrain or np.recarray
             Spike train data to be transformed to latent variables.
             The outer list corresponds to trials and the inner list
             corresponds to the neurons recorded in that trial, such that
-            `spiketrains[l][n]` is the spike train of neuron `n` in trial `l`.
+            `trials[l][n]` is the spike train of neuron `n` in trial `l`.
             Note that the number and order of `neo.SpikeTrain` objects per
-            trial must be fixed such that `spiketrains[l][n]` and
-            `spiketrains[k][n]` refer to spike trains of the same neuron
+            trial must be fixed such that `trials[l][n]` and
+            `trials[k][n]` refer to spike trains of the same neuron
             for any choices of `l`, `k`, and `n`.
+
+            Alternatively, pass continuous data pre-binned into a
+            np.recarray, whose n-th element (corresponding to the n-th
+            experimental trial) has the fields:
+
+            T : int
+                number of bins
+            y : (#units, T) np.ndarray
+                neural data
+
         returned_data : list of str
             The dimensionality reduction transform generates the following
             resultant data:
@@ -415,25 +451,39 @@
             `VsmGP`: (#bins, #bins, #latent_vars) np.ndarray
 
             Note that the num. of bins (#bins) can vary across trials,
-            reflecting the trial durations in the given `spiketrains` data.
+            reflecting the trial durations in the given `trials` data.
 
         Raises
         ------
         ValueError
-            If the number of neurons in `spiketrains` is different from that
+            If the number of neurons in `trials` is different from that
             in the training spiketrain data.
 
             If `returned_data` contains keys different from the ones in
             `self.valid_data_names`.
         """
-        if len(spiketrains[0]) != len(self.has_spikes_bool):
-            raise ValueError("'spiketrains' must contain the same number of "
-                             "neurons as the training spiketrain data")
-
         invalid_keys = set(returned_data).difference(self.valid_data_names)
         if len(invalid_keys) > 0:
             raise ValueError("'returned_data' can only have the following "
                              "entries: {}".format(self.valid_data_names))
-        seqs = gpfa_util.get_seqs(spiketrains, self.bin_size)
+
+        if isinstance(trials, list):
+            if len(trials[0]) != len(self.has_spikes_bool):
+                raise ValueError("'trials' must contain the same number of "
+                                 "neurons as the training spiketrain data")
+            seqs = gpfa_util.get_seqs(trials, self.bin_size)
+        elif isinstance(trials, np.ndarray):
+            # Validate the number of units against the training data
+            if len(trials['y'][0]) != len(self.has_spikes_bool):
+                raise ValueError("'trials' must contain the same number of "
+                                 "neurons as the training spiketrain data")
+            seqs = trials
+        else:
+            raise ValueError("'trials' must be provided either as a "
+                             "np.recarray or as a list of lists of "
+                             "neo.SpikeTrain")
+
         for seq in seqs:
             seq['y'] = seq['y'][self.has_spikes_bool, :]
         seqs, ll = gpfa_core.exact_inference_with_ll(seqs,
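Usage sketch for the changed `transform` signature (toy, assumed data; `em_max_iters` kept small only so the snippet runs quickly): a single entry in `returned_data` returns the trajectories directly, several entries return a dict.

```python
import numpy as np
import quantities as pq
import neo
from elephant.gpfa import GPFA

# Toy data (assumed, seeded): 5 trials of 6 random spike trains, 2 s each.
rng = np.random.default_rng(42)
trials = [[neo.SpikeTrain(np.sort(rng.uniform(0, 2, size=40)), units='s',
                          t_stop=2.0)
           for _ in range(6)]
          for _ in range(5)]

gpfa = GPFA(bin_size=20 * pq.ms, x_dim=2, em_max_iters=5)
# Default single key: the per-trial trajectories come back directly.
xorth = gpfa.fit_transform(trials)
# Several keys: a dict mapping each requested name to its data.
results = gpfa.transform(trials, returned_data=['latent_variable_orth',
                                                'latent_variable'])
print(results['latent_variable_orth'][0].shape)  # (x_dim, #bins) per trial
```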
@@ -447,15 +497,14 @@
             return seqs[returned_data[0]]
         return {x: seqs[x] for x in returned_data}
 
-    def fit_transform(self, spiketrains, returned_data=[
-            'latent_variable_orth']):
+    def fit_transform(self, trials, returned_data=('latent_variable_orth',)):
         """
-        Fit the model with `spiketrains` data and apply the dimensionality
-        reduction on `spiketrains`.
+        Fit the model with `trials` data and apply the dimensionality
+        reduction on `trials`.
 
         Parameters
         ----------
-        spiketrains : list of list of neo.SpikeTrain
+        trials : list of list of neo.SpikeTrain or np.recarray
             Refer to the :func:`GPFA.fit` docstring.
 
         returned_data : list of str
@@ -469,37 +518,42 @@
 
         Raises
         ------
         ValueError
             Refer to :func:`GPFA.fit` and :func:`GPFA.transform`.
 
         See Also
         --------
-        GPFA.fit : fit the model with `spiketrains`
-        GPFA.transform : transform `spiketrains` into trajectories
+        GPFA.fit : fit the model with `trials`
+        GPFA.transform : transform `trials` into trajectories
         """
-        self.fit(spiketrains)
-        return self.transform(spiketrains, returned_data=returned_data)
+        if isinstance(trials, (list, np.ndarray)):
+            self.fit(trials)
+            return self.transform(trials, returned_data=returned_data)
+        raise ValueError("'trials' must be provided either as a "
+                         "np.recarray or as a list of lists of "
+                         "neo.SpikeTrain")
 
-    def score(self, spiketrains):
+    def score(self, trials):
         """
         Returns the log-likelihood of the given data under the fitted model
 
         Parameters
         ----------
-        spiketrains : list of list of neo.SpikeTrain
+        trials : list of list of neo.SpikeTrain or np.recarray
             Spike train data to be scored.
             The outer list corresponds to trials and the inner list
             corresponds to the neurons recorded in that trial, such that
-            `spiketrains[l][n]` is the spike train of neuron `n` in trial `l`.
+            `trials[l][n]` is the spike train of neuron `n` in trial `l`.
             Note that the number and order of `neo.SpikeTrain` objects per
-            trial must be fixed such that `spiketrains[l][n]` and
-            `spiketrains[k][n]` refer to spike trains of the same neuron
+            trial must be fixed such that `trials[l][n]` and
+            `trials[k][n]` refer to spike trains of the same neuron
             for any choice of `l`, `k`, and `n`.
 
         Returns
         -------
         log_likelihood : float
-            Log-likelihood of the given spiketrains under the fitted model.
+            Log-likelihood of the given trials under the fitted model.
         """
-        self.transform(spiketrains)
+        self.transform(trials)
         return self.transform_info['log_likelihood']
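End-to-end sketch of the new np.recarray path (toy, assumed data): pre-bin once with `gpfa_util.get_seqs`, then fit and transform on the recarray. The new `test_equality_spiketrains_seqs_train` below checks that this agrees with fitting on the raw spike trains.

```python
import numpy as np
import quantities as pq
import neo
from elephant.gpfa import GPFA
from elephant.gpfa import gpfa_util

# Toy data (assumed, seeded): 4 trials of 5 spike trains, 1 s each.
rng = np.random.default_rng(0)
trials = [[neo.SpikeTrain(np.sort(rng.uniform(0, 1, size=30)), units='s',
                          t_stop=1.0)
           for _ in range(5)]
          for _ in range(4)]

bin_size = 20 * pq.ms
# Pre-bin once; `seqs` is the np.recarray with fields 'T' and 'y'.
seqs = gpfa_util.get_seqs(trials, bin_size)

gpfa = GPFA(bin_size=bin_size, x_dim=2, em_max_iters=5)
gpfa.fit(seqs)                # np.recarray path, equivalent to gpfa.fit(trials)
xorth = gpfa.transform(seqs)
```

Note a design consequence visible in `_format_training_data_seqs`: when silent units are present, fitting filters the rows of `seqs['y']` in place, so the caller's recarray is mutated.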
diff --git a/elephant/gpfa/gpfa_util.py b/elephant/gpfa/gpfa_util.py
index 540c13f37..e91b32204 100644
--- a/elephant/gpfa/gpfa_util.py
+++ b/elephant/gpfa/gpfa_util.py
@@ -19,13 +19,13 @@
 
 @deprecated_alias(binsize='bin_size')
-def get_seqs(data, bin_size, use_sqrt=True):
+def get_seqs(trials, bin_size, use_sqrt=True):
     """
-    Converts the data into a rec array using internally BinnedSpikeTrain.
+    Converts the data into a np.recarray, using BinnedSpikeTrain internally.
 
     Parameters
     ----------
-    data : list of list of neo.SpikeTrain
+    trials : list of list of neo.SpikeTrain
         The outer list corresponds to trials and the inner list corresponds
-        to the neurons recorded in that trial, such that data[l][n] is the
+        to the neurons recorded in that trial, such that trials[l][n] is the
         spike train of neuron n in trial l. Note that the number and order of
@@ -36,7 +36,7 @@ def get_seqs(data, bin_size, use_sqrt=True):
         Spike bin width
 
     use_sqrt: bool
-        Boolean specifying whether or not to use square-root transform on
+        Boolean specifying whether to use square-root transform on
         spike counts (see original paper for motivation).
         Default: True
@@ -58,17 +58,15 @@
     if not isinstance(bin_size, pq.Quantity):
         raise ValueError("'bin_size' must be of type pq.Quantity")
 
-    seqs = []
-    for dat in data:
-        sts = dat
-        binned_spiketrain = BinnedSpikeTrain(sts, bin_size=bin_size)
-        if use_sqrt:
-            binned = np.sqrt(binned_spiketrain.to_array())
-        else:
-            binned = binned_spiketrain.to_array()
-        seqs.append(
-            (binned_spiketrain.n_bins, binned))
-    seqs = np.array(seqs, dtype=[('T', int), ('y', 'O')])
+    def bin_trial(trial):
+        # bin each trial exactly once: (number of bins, binned counts)
+        binned_spiketrain = BinnedSpikeTrain(trial, bin_size=bin_size)
+        binned = binned_spiketrain.to_array()
+        if use_sqrt:
+            binned = np.sqrt(binned)
+        return binned_spiketrain.n_bins, binned
+
+    # create a np.recarray by specifying the dtype with np.array()
+    seqs = np.array([bin_trial(trial) for trial in trials],
+                    dtype=[('T', int), ('y', 'O')])
 
     # Remove trials that are shorter than one bin width
     if len(seqs) > 0:
@@ -88,12 +86,12 @@ def cut_trials(seq_in, seg_length=20):
     Parameters
     ----------
     seq_in : np.recarray
-        data structure, whose nth entry (corresponding to the nth experimental
-        trial) has fields
+        structure of trials, whose n-th entry (corresponding to the n-th
+        experimental trial) has fields
         T : int
             number of timesteps in trial
         y : (yDim, T) np.ndarray
             neural data
 
     seg_length : int
         length of segments to extract, in number of timesteps. If infinite,
@@ -103,12 +101,12 @@
     Returns
     -------
     seqOut : np.recarray
-        data structure, whose nth entry (corresponding to the nth experimental
-        trial) has fields
+        structure of trials, whose n-th entry (corresponding to the n-th
+        experimental trial) has fields
         T : int
             number of timesteps in segment
         y : (yDim, T) np.ndarray
             neural data
 
     Raises
     ------
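Aside on `use_sqrt` (motivated in Yu et al.): the square root approximately stabilizes the variance of Poisson-like counts — Var(X) grows linearly with the rate, while Var(sqrt(X)) tends to 1/4 as the rate grows — which suits the constant-noise FA model. A toy illustration (assumed data):

```python
import numpy as np

rng = np.random.default_rng(1)
for lam in (2.0, 8.0, 32.0):
    counts = rng.poisson(lam, size=100000)
    # raw variance tracks lam; sqrt-count variance approaches 0.25
    print(lam, counts.var().round(2), np.sqrt(counts).var().round(3))
```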
diff --git a/elephant/test/test_gpfa.py b/elephant/test/test_gpfa.py
index 19d35ae31..2d9c8e8af 100644
--- a/elephant/test/test_gpfa.py
+++ b/elephant/test/test_gpfa.py
@@ -13,7 +13,9 @@
 import quantities as pq
 from numpy.testing import assert_array_equal, assert_array_almost_equal
 
-from elephant.spike_train_generation import homogeneous_poisson_process
+from elephant.spike_train_generation import StationaryPoissonProcess
+
+
 try:
     import sklearn
@@ -30,10 +32,9 @@ class GPFATestCase(unittest.TestCase):
     def setUp(self):
         def gen_gamma_spike_train(k, theta, t_max):
-            x = []
-            for i in range(int(3 * t_max / (k * theta))):
-                x.append(np.random.gamma(k, theta))
+            x = np.random.gamma(k, theta, size=int(3 * t_max / (k * theta)))
             s = np.cumsum(x)
             return s[s < t_max]
 
         def gen_test_data(rates, durs, shapes=(1, 1, 1, 1)):
@@ -52,39 +53,42 @@ def gen_test_data(rates, durs, shapes=(1, 1, 1, 1)):
         durs = (2.5, 2.5, 2.5, 2.5)
         np.random.seed(0)
         n_trials = 100
-        self.data0 = []
-        for trial in range(n_trials):
-            n1 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n2 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n3 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n4 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n5 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n6 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n7 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            n8 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1 * pq.s,
-                                t_start=0 * pq.s, t_stop=10 * pq.s)
-            self.data0.append([n1, n2, n3, n4, n5, n6, n7, n8])
+
+        # create 100 trials, each containing 8 spike trains of 10 s duration
+        # with rates a or b
+        def spiketrain_10_sec(rates):
+            return neo.SpikeTrain(gen_test_data(rates, durs), units=1 * pq.s,
+                                  t_start=0 * pq.s, t_stop=10 * pq.s)
+
+        def neurons():
+            # n1-n2: rates_a, n3-n4: rates_b, n5-n6: rates_a, n7-n8: rates_b
+            return [spiketrain_10_sec(rates_a), spiketrain_10_sec(rates_a),
+                    spiketrain_10_sec(rates_b), spiketrain_10_sec(rates_b),
+                    spiketrain_10_sec(rates_a), spiketrain_10_sec(rates_a),
+                    spiketrain_10_sec(rates_b), spiketrain_10_sec(rates_b)]
+
+        self.data0 = [neurons() for _ in range(n_trials)]
+
         self.x_dim = 4
         self.data1 = self.data0[:20]
 
         # generate data2
         np.random.seed(27)
-        self.data2 = []
         n_trials = 10
         n_channels = 20
-        for trial in range(n_trials):
-            rates = np.random.randint(low=1, high=100, size=n_channels)
-            spike_times = [homogeneous_poisson_process(rate=rate * pq.Hz)
-                           for rate in rates]
-            self.data2.append(spike_times)
+        self.data2 = [
+            [StationaryPoissonProcess(
+                rate=rate * pq.Hz,
+                t_stop=1000 * pq.ms).generate_spiketrain()
+             for rate in np.random.randint(low=1, high=100, size=n_channels)]
+            for _ in range(n_trials)]
+
+        # generate seqs_train data for the np.recarray input path
+        self.seqs_train = gpfa_util.get_seqs(self.data1,
+                                             bin_size=self.bin_size)
 
     def test_data1(self):
         gpfa = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
@@ -131,9 +135,9 @@ def test_invalid_input_data(self):
             _ = GPFA(tau_init=invalid_tau_init)
         gpfa = GPFA()
         with self.assertRaises(ValueError):
-            gpfa.fit(spiketrains=invalid_data)
+            gpfa.fit(trials=invalid_data)
         with self.assertRaises(ValueError):
-            gpfa.fit(spiketrains=[])
+            gpfa.fit(trials=[])
 
     def test_data2(self):
         gpfa = GPFA(bin_size=self.bin_size, x_dim=8,
                     em_max_iters=self.n_iters)
@@ -161,7 +165,7 @@ def test_returned_data(self):
         self.assertTrue(len(returned_data) == len(seqs))
         self.assertTrue(isinstance(seqs, dict))
         with self.assertRaises(ValueError):
-            seqs = gpfa.transform(self.data2, returned_data=['invalid_name'])
+            gpfa.transform(self.data2, returned_data=['invalid_name'])
 
     def test_fit_transform(self):
         gpfa1 = GPFA(bin_size=self.bin_size, x_dim=self.x_dim,
@@ -216,6 +220,38 @@ def test_logdet(self):
         logdet_ground_truth = np.log(np.linalg.det(matrix))
         assert_array_almost_equal(logdet_fast, logdet_ground_truth)
 
+    def test_equality_spiketrains_seqs_train(self):
+        gpfa_seqs = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
+        gpfa_seqs.fit_transform(self.seqs_train)
+
+        gpfa_spiketrains = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
+        gpfa_spiketrains.fit_transform(self.data1)
+
+        self.assertAlmostEqual(
+            gpfa_seqs.transform_info['log_likelihood'],
+            gpfa_spiketrains.transform_info['log_likelihood'], places=5)
+
+    def test_fit_seqs_train(self):
+        gpfa = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
+        gpfa.fit(self.seqs_train)
+        target = [0.07919926, 0.07546947, 0.07397758, 0.0780272, 0.07773862,
+                  0.07695879, 0.07469927, 0.07867749]
+        res = gpfa.params_estimated['d']
+        assert_array_almost_equal(res, target, decimal=5)
+
+    def test_transform_seqs_train(self):
+        gpfa = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
+        gpfa.fit(self.seqs_train)
+        gpfa.transform(self.seqs_train)
+        self.assertAlmostEqual(gpfa.transform_info['log_likelihood'],
+                               -8172.004695554373, places=5)
+
+    def test_fit_transform_seqs_train(self):
+        gpfa = GPFA(x_dim=self.x_dim, em_max_iters=self.n_iters)
+        gpfa.fit_transform(self.seqs_train)
+        self.assertAlmostEqual(gpfa.transform_info['log_likelihood'],
+                               -8172.004695554373, places=5)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
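To exercise just the tests added here, something like the following should work (assuming an installed Elephant with this patch applied; the test module path follows the diff above):

```python
import unittest

# Load and run only the new np.recarray-related tests.
names = ["elephant.test.test_gpfa.GPFATestCase." + test
         for test in ("test_fit_seqs_train",
                      "test_transform_seqs_train",
                      "test_fit_transform_seqs_train",
                      "test_equality_spiketrains_seqs_train")]
suite = unittest.defaultTestLoader.loadTestsFromNames(names)
unittest.TextTestRunner(verbosity=2).run(suite)
```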