Update internals to vectorised SciPy #94
Changes from 20 commits
@@ -41,6 +41,7 @@
from obspy import Trace, Catalog, UTCDateTime, Stream, read, read_events
from obspy.core.event import Event, Pick, CreationInfo, ResourceIdentifier
from obspy.core.event import Comment, WaveformStreamID
from scipy.fftpack import next_fast_len

from eqcorrscan.utils.timer import Timer
from eqcorrscan.utils.findpeaks import find_peaks2_short, decluster

@@ -51,6 +52,31 @@
from eqcorrscan.core.lag_calc import lag_calc


def _spike_test(stream, percent=0.99, multiplier=1e6):
    """
    Check for very large spikes in data and raise an error if found.

    :param stream: Stream to look for spikes in.
    :type stream: :class:`obspy.core.stream.Stream`
    :param percent: Percentage as a decimal to calculate range for.
    :type percent: float
    :param multiplier: Multiplier of range to define a spike.
    :type multiplier: float
    """
    for tr in stream:
        if (tr.data > 2 * np.max(np.sort(
                np.abs(tr.data))[0:int(percent * len(tr.data))]) *
                multiplier).sum() > 0:
            msg = ('Spikes above ' + str(multiplier) +
                   ' of the range of ' + str(percent) +
                   ' of the data present, check.\n' +
                   'This would otherwise likely result in an issue during ' +
                   'FFT prior to cross-correlation.\n' +
                   'If you think this spike is real please report ' +
                   'this as a bug.')
            raise MatchFilterError(msg)


class MatchFilterError(Exception):
    """
    Default error for match-filter errors.
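The criterion used by `_spike_test` can be exercised on a bare numpy array. `has_spike` below is a hypothetical stand-alone rewrite of the same check for illustration; it is not part of the eqcorrscan API:

```python
import numpy as np

def has_spike(data, percent=0.99, multiplier=1e6):
    # Largest absolute amplitude within the lowest `percent` of samples
    baseline = np.max(np.sort(np.abs(data))[0:int(percent * len(data))])
    # Any sample exceeding 2 * baseline * multiplier counts as a spike
    return bool((data > 2 * baseline * multiplier).sum() > 0)

clean = np.random.RandomState(42).randn(1000)
spiked = clean.copy()
spiked[500] = 1e9  # inject an unphysically large sample
```

Here `has_spike(clean)` is False while `has_spike(spiked)` is True, which mirrors the `MatchFilterError` path above.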
@@ -3360,6 +3386,170 @@ def normxcorr2(template, image):
    return ccc


def multi_normxcorr(templates, stream, pads):
    """
    Compute the normalized cross-correlation of multiple templates with data.

    :param templates: 2D array of templates
    :type templates: np.ndarray
    :param stream: 1D array of continuous data
    :type stream: np.ndarray
    :param pads: List of ints of pad lengths in the same order as templates
    :type pads: list

    :return: np.ndarray of cross-correlations and np.ndarray of used channels
    """
    # TODO: Try other fft methods: pyfftw?
    import bottleneck
    from scipy.signal.signaltools import _centered
    from scipy.fftpack.helper import next_fast_len

    # Generate a template mask
    used_chans = ~np.isnan(templates).any(axis=1)
    # Currently have to use float64 as bottleneck runs into issues with other
    # types: https://github.com/kwgoodman/bottleneck/issues/164
    stream = stream.astype(np.float64)
    templates = templates.astype(np.float64)
    template_length = templates.shape[1]
    stream_length = len(stream)
    fftshape = next_fast_len(template_length + stream_length - 1)
    # Set up normalizers
    stream_mean_array = bottleneck.move_mean(
        stream, template_length)[template_length - 1:]
    stream_std_array = bottleneck.move_std(
        stream, template_length)[template_length - 1:]
    # Normalize and flip the templates
    norm = ((templates - templates.mean(axis=-1, keepdims=True)) / (
        templates.std(axis=-1, keepdims=True) * template_length))
    norm_sum = norm.sum(axis=-1, keepdims=True)
    stream_fft = np.fft.rfft(stream, fftshape)
    template_fft = np.fft.rfft(np.flip(norm, axis=-1), fftshape, axis=-1)

Review comment: Numpy's flip was added in version 1.12.0, so we need to make sure and bump the version in setup.py.

Review comment: Obspy 1.0.3 does not play nice on Windows, but the current master does. I'm going to keep appveyor running the obspy master and travis running the current release until the next obspy release, when appveyor should revert to the current obspy release. I think it comes down to their pinning of matplotlib?
    res = np.fft.irfft(template_fft * stream_fft,
                       fftshape)[:, 0:template_length + stream_length - 1]
    res = ((_centered(res, stream_length - template_length + 1)) -
           norm_sum * stream_mean_array) / stream_std_array
    for i in range(len(pads)):
        # This is a hack from padding templates with nan data
        if np.isnan(res[i]).all():
            res[i] = np.zeros(len(res[i]))
        else:
            res[i] = np.append(res[i], np.zeros(pads[i]))[pads[i]:]
    return res.astype(np.float32), used_chans

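The frequency-domain pipeline above (normalise the template, correlate via rfft/irfft, then correct with the moving mean and standard deviation of the stream) can be checked against the textbook definition on a single channel. This sketch uses numpy's `sliding_window_view` in place of bottleneck's moving statistics and skips `next_fast_len`; it illustrates the technique and is not the eqcorrscan implementation:

```python
import numpy as np

def normxcorr_fft(template, data):
    """Normalised cross-correlation of one template against one channel."""
    n_t, n_d = len(template), len(data)
    fftshape = n_t + n_d - 1  # next_fast_len(fftshape) would pad for speed
    # Normalise the template as in multi_normxcorr
    norm = (template - template.mean()) / (template.std() * n_t)
    # Correlation is convolution with the time-reversed template
    template_fft = np.fft.rfft(norm[::-1], fftshape)
    data_fft = np.fft.rfft(data, fftshape)
    full = np.fft.irfft(template_fft * data_fft, fftshape)
    valid = full[n_t - 1:n_d]  # the fully-overlapping part of the correlation
    # Moving mean and std of the data over the template window
    windows = np.lib.stride_tricks.sliding_window_view(data, n_t)
    move_mean = windows.mean(axis=-1)
    move_std = windows.std(axis=-1)
    return (valid - norm.sum() * move_mean) / move_std

rng = np.random.RandomState(0)
data = rng.randn(200)
template = data[50:80].copy()  # template is an exact slice of the data
cc = normxcorr_fft(template, data)
```

Because the template is cut directly from the data, the correlation peaks at 1.0 (to floating-point precision) at sample 50.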
def multichannel_xcorr(templates, stream, use_dask=False, compute=True,
                       cores=1):
    """
    Cross-correlate multiple channels either in parallel or not.

    :type templates: list
    :param templates:
        A list of templates, where each one should be an obspy.Stream object
        containing multiple traces of seismic data and the relevant header
        information.
    :type stream: obspy.core.stream.Stream
    :param stream:
        A single Stream object to be correlated with the templates. This is
        in effect the image in normxcorr2 and cv2.
    :type use_dask: bool
    :param use_dask:
        Whether to use dask for multiprocessing; if False, will use Python's
        native multiprocessing.
    :type compute: bool
    :param compute:
        Only valid if use_dask=True. If compute=False, the returned result
        will be a dask.delayed object, useful if you are using dask to
        compute multiple time-steps at the same time.
    :type cores: int
    :param cores:
        Number of processes to use; if set to None and use_dask=False, no
        multiprocessing will be done.

    :returns:
        New list of :class:`numpy.ndarray` objects. These will contain
        the correlation sums for each template for this day of data.
    :rtype: list
    :returns:
        List of ints as number of channels used for each cross-correlation.
    :rtype: list
    :returns:
        List of lists of tuples of station, channel for all
        cross-correlations.
    :rtype: list

    .. Note::
        Each template must contain the same channels as every other template,
        the stream must also contain the same channels (note that if there
        are duplicate channels in the template you do not need duplicate
        channels in the stream).
    """
    no_chans = np.zeros(len(templates))
    chans = [[] for _i in range(len(templates))]
    # Do some reshaping
    stream.sort(['network', 'station', 'location', 'channel'])
    t_starts = []
    for template in templates:
        template.sort(['network', 'station', 'location', 'channel'])
        t_starts.append(min([tr.stats.starttime for tr in template]))
    seed_ids = [tr.id + '_' + str(i) for i, tr in enumerate(templates[0])]
    template_array = {}
    stream_array = {}
    pad_array = {}
    for i, seed_id in enumerate(seed_ids):
        t_ar = np.array([template[i].data for template in templates])
        template_array.update({seed_id: t_ar})
        stream_array.update(
            {seed_id: stream.select(id=seed_id.split('_')[0])[0].data})
        pad_list = [
            int(round(template[i].stats.sampling_rate *
                      (template[i].stats.starttime - t_starts[j])))
            for j, template in zip(range(len(templates)), templates)]
        pad_array.update({seed_id: pad_list})
    # if use_dask:
    #     import dask
    #     xcorrs = []
    #     for seed_id in seed_ids:
    #         tr_xcorrs, tr_chans = dask.delayed(multi_normxcorr)(
    #             templates=template_array[seed_id],
    #             stream=stream.select(id=seed_id.split('_')[0])[0].data)
    #         xcorrs.append(tr_xcorrs)
    #     cccsums = dask.delayed(np.sum)(xcorrs, axis=0)
    #     if compute:
    #         cccsums.compute()
    if cores is None:

Review comment: Would it make sense to abstract the multiprocessing further up? I know several functions use something similar, so maybe we could make a generic pool interface on the module level; then we could have persistent processes/threads in the pool so we wouldn't need to spin them up every time.
        cccsums = np.zeros([len(templates),
                            len(stream[0]) - len(templates[0][0]) + 1])
        for seed_id in seed_ids:
            tr_xcorrs, tr_chans = multi_normxcorr(
                templates=template_array[seed_id],
                stream=stream_array[seed_id], pads=pad_array[seed_id])
            cccsums = np.sum([cccsums, tr_xcorrs], axis=0)
            no_chans += tr_chans.astype(np.int)
            for chan, state in zip(chans, tr_chans):
                if state:
                    chan.append((seed_id.split('.')[1],
                                 seed_id.split('.')[-1].split('_')[0]))
    else:
        pool = Pool(processes=cores)
        results = [pool.apply_async(
            multi_normxcorr, (template_array[seed_id], stream_array[seed_id],
                              pad_array[seed_id]))
            for seed_id in seed_ids]
        pool.close()
        results = [p.get() for p in results]
        xcorrs = [p[0] for p in results]
        tr_chans = np.array([p[1] for p in results])
        pool.join()
        cccsums = np.sum(xcorrs, axis=0)
        no_chans = np.sum(tr_chans.astype(np.int), axis=0)
        for seed_id, tr_chan in zip(seed_ids, tr_chans):
            for chan, state in zip(chans, tr_chan):
                if state:
                    chan.append((seed_id.split('.')[1],
                                 seed_id.split('.')[-1].split('_')[0]))
    return cccsums, no_chans, chans

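The pad handling at the end of `multi_normxcorr` (append zeros, then slice) shifts each channel's correlation back by its offset from the earliest channel. In isolation, with made-up numbers:

```python
import numpy as np

# Correlation for a channel whose template window starts 2 samples after
# the earliest channel in the template
cc = np.array([0.1, 0.0, 0.2, 0.9, 0.0, 0.0], dtype=np.float32)
pad = 2

# Append `pad` zeros then drop the first `pad` samples: the peak moves
# from index 3 to index 1, aligning it with the common template origin
shifted = np.append(cc, np.zeros(pad, dtype=cc.dtype))[pad:]
```

After this shift, summing the per-channel arrays into `cccsums` adds the peaks from all channels at the same sample.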
def _template_loop(template, chan, stream_ind, debug=0, i=0):
    """
    Handle individual template correlations.

@@ -3421,7 +3611,7 @@ def _template_loop(template, chan, stream_ind, debug=0, i=0):
    return i, ccc


-def _channel_loop(templates, stream, cores=1, debug=0, internal=True):
+def _channel_loop(templates, stream, cores=1, debug=0):
    """
    Internal loop for parallel processing.

@@ -3442,10 +3632,6 @@ def _channel_loop(templates, stream, cores=1, debug=0):
    :param cores: Number of cores to loop over
    :type debug: int
    :param debug: Debug level.
-    :type internal: bool
-    :param internal:
-        Whether to use the internal Python code (True) or the experimental
-        compiled code.

    :returns:
        New list of :class:`numpy.ndarray` objects. These will contain

@@ -3464,8 +3650,6 @@ def _channel_loop(templates, stream, cores=1, debug=0):
        are duplicate channels in the template you do not need duplicate
        channels in the stream).
    """
-    if not internal:
-        print('Not yet coded')
    num_cores = cores
    if len(templates) < num_cores:
        num_cores = len(templates)

@@ -3809,7 +3993,7 @@ def match_filter(template_names, template_list, st, threshold,
        raise MatchFilterError(msg)
    outtic = time.clock()
    if debug >= 2:
-        print('Ensuring all template channels have matches in long data')
+        print('Ensuring all template channels have matches in continuous data')
    template_stachan = {}
    # Work out what station-channel pairs are in the templates, including
    # duplicate station-channel pairs. We will use this information to fill

@@ -3908,9 +4092,8 @@ def match_filter(template_names, template_list, st, threshold,
        for template in templates:
            print(template)
        print(stream)
-    [cccsums, no_chans, chans] = _channel_loop(
-        templates=templates, stream=stream, cores=cores, debug=debug,
-        internal=internal)
+    [cccsums, no_chans, chans] = multichannel_xcorr(
+        templates=templates, stream=stream, cores=cores)
    if len(cccsums[0]) == 0:
        raise MatchFilterError('Correlation has not run, zero length cccsum')
    outtoc = time.clock()

@@ -4029,31 +4212,6 @@ def match_filter(template_names, template_list, st, threshold,
    return detections, det_cat, detection_streams

-def _spike_test(stream, percent=0.99, multiplier=1e6):
-    """
-    Check for very large spikes in data and raise an error if found.
-
-    :param stream: Stream to look for spikes in.
-    :type stream: :class:`obspy.core.stream.Stream`
-    :param percent: Percentage as a decimal to calculate range for.
-    :type percent: float
-    :param multiplier: Multiplier of range to define a spike.
-    :type multiplier: float
-    """
-    for tr in stream:
-        if (tr.data > 2 * np.max(np.sort(
-                np.abs(tr.data))[0:int(percent * len(tr.data))]) *
-                multiplier).sum() > 0:
-            msg = ('Spikes above ' + str(multiplier) +
-                   ' of the range of ' + str(percent) +
-                   ' of the data present, check.\n' +
-                   'This would otherwise likely result in an issue during ' +
-                   'FFT prior to cross-correlation.\n' +
-                   'If you think this spike is real please report ' +
-                   'this as a bug.')
-            raise MatchFilterError(msg)


if __name__ == "__main__":
    import doctest
    doctest.testmod()

@@ -72,21 +72,20 @@
    if not READ_THE_DOCS:
        install_requires = ['numpy>=1.8.0', 'obspy>=1.0.0',
                            'matplotlib>=1.3.0', 'joblib>=0.8.4',
-                            'scipy>=0.14', 'multiprocessing',
-                            'LatLon', 'h5py', 'cython', 'bottleneck']
+                            'scipy>=0.18', 'LatLon', 'cython',
+                            'bottleneck', 'xarray']
    else:
        install_requires = ['numpy>=1.8.0', 'obspy>=1.0.0',
                            'matplotlib>=1.3.0', 'joblib>=0.8.4',
-                            'multiprocessing',
                            'LatLon']
else:
    if not READ_THE_DOCS:
-        install_requires = ['numpy>=1.8.0', 'obspy>=0.10.2',
+        install_requires = ['numpy>=1.8.0', 'obspy>=1.0.0',
                            'matplotlib>=1.3.0', 'joblib>=0.8.4',
-                            'scipy>=0.14', 'LatLon', 'h5py', 'cython',
-                            'bottleneck']
+                            'scipy>=0.18', 'LatLon', 'cython',
+                            'bottleneck', 'xarray']
    else:
-        install_requires = ['numpy>=1.8.0', 'obspy>=0.10.2',
+        install_requires = ['numpy>=1.8.0', 'obspy>=1.0.0',
                            'matplotlib>=1.3.0', 'joblib>=0.8.4',
                            'LatLon']
# install_requires.append('ConfigParser')

Review comment: As mentioned above, we need at least numpy 1.12.0 because we are using flip.

Reply: Thanks!
Review comment: Is this for avoiding bottleneck issue 164? It might be worth adding a bit to the docstring about why this function might be used.

Reply: It's not; this was for an issue from openCV correlations. I'm not sure whether it is needed for the scipy internals. I need to check that, and if it's not needed I should remove it: it's really slow and at least one user has complained!

Reply: So I have sped this up a lot, and it looks like it is needed to stabilise FFTs.